From a41175a2b395e137f78827519a8a53b45cd99543 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Mon, 26 Jan 2026 20:36:00 +0100
Subject: [PATCH 01/43] Add new header file `dpnp4pybind11.hpp` (#2734)

The PR adds a header file `dpnp4pybind11.hpp` which contains minimum
necessary content to write pybind11 extensions and includes a caster for
`usm_ndarray` and type enumerators.

This PR also includes movement for a part of dpctl.tensor header which
previously used in dpnp code. It is needed to get rid of include
conflicts, since now repiques including new `dpnp4pybind11.hpp` header
everywhere.
---
 CMakeLists.txt                                |    1 -
 .../libtensor/include/kernels/alignment.hpp   |   46 +
 .../include/kernels/dpctl_tensor_types.hpp    |   40 +
 .../kernels/elementwise_functions/common.hpp  | 1045 +++++++++++++
 .../elementwise_functions/common_detail.hpp   |   70 +
 .../elementwise_functions/logaddexp.hpp       |  268 ++++
 .../kernels/elementwise_functions/maximum.hpp |  322 ++++
 .../kernels/elementwise_functions/minimum.hpp |  321 ++++
 .../elementwise_functions/sycl_complex.hpp    |   44 +
 .../elementwise_functions/vec_size_util.hpp   |   70 +
 .../include/utils/indexing_utils.hpp          |  153 ++
 .../libtensor/include/utils/math_utils.hpp    |  148 ++
 .../include/utils/memory_overlap.hpp          |  157 ++
 .../libtensor/include/utils/offset_utils.hpp  |  824 ++++++++++
 .../include/utils/output_validation.hpp       |   79 +
 .../libtensor/include/utils/strided_iters.hpp |  996 ++++++++++++
 .../include/utils/sycl_alloc_utils.hpp        |  223 +++
 .../libtensor/include/utils/sycl_utils.hpp    |  662 ++++++++
 .../libtensor/include/utils/type_dispatch.hpp |  134 ++
 .../include/utils/type_dispatch_building.hpp  |  300 ++++
 .../libtensor/include/utils/type_utils.hpp    |  164 ++
 dpnp/backend/CMakeLists.txt                   |    1 -
 dpnp/backend/extensions/blas/CMakeLists.txt   |   13 +-
 dpnp/backend/extensions/blas/dot_common.hpp   |    1 +
 dpnp/backend/extensions/blas/gemm.hpp         |    2 +-
 dpnp/backend/extensions/blas/gemv.hpp         |    2 +-
 dpnp/backend/extensions/blas/syrk.hpp         |    2 +-
 dpnp/backend/extensions/common/ext/common.hpp |    2 +
 .../common/ext/details/common_internal.hpp    |    4 +-
 .../common/ext/validation_utils.hpp           |    5 +-
 .../elementwise_functions.hpp                 |    8 +-
 .../elementwise_functions_type_utils.cpp      |    5 +-
 .../elementwise_functions_type_utils.hpp      |    4 +-
 dpnp/backend/extensions/fft/CMakeLists.txt    |   12 +-
 dpnp/backend/extensions/fft/in_place.hpp      |    5 +-
 dpnp/backend/extensions/fft/in_place.tpp      |   10 +-
 dpnp/backend/extensions/fft/out_of_place.hpp  |    5 +-
 dpnp/backend/extensions/fft/out_of_place.tpp  |   12 +-
 .../extensions/indexing/CMakeLists.txt        |   11 +-
 dpnp/backend/extensions/indexing/choose.cpp   |   13 +-
 dpnp/backend/extensions/lapack/CMakeLists.txt |   12 +-
 dpnp/backend/extensions/lapack/geqrf.hpp      |    2 +-
 dpnp/backend/extensions/lapack/gesv.hpp       |    2 +-
 dpnp/backend/extensions/lapack/gesvd.hpp      |    2 +-
 dpnp/backend/extensions/lapack/getrf.hpp      |    2 +-
 dpnp/backend/extensions/lapack/getri.hpp      |    2 +-
 dpnp/backend/extensions/lapack/getrs.hpp      |    2 +-
 dpnp/backend/extensions/lapack/heevd.cpp      |    1 +
 .../backend/extensions/lapack/heevd_batch.cpp |    1 +
 dpnp/backend/extensions/lapack/orgqr.hpp      |    2 +-
 dpnp/backend/extensions/lapack/potrf.hpp      |    2 +-
 dpnp/backend/extensions/lapack/syevd.cpp      |    1 +
 .../backend/extensions/lapack/syevd_batch.cpp |    1 +
 dpnp/backend/extensions/lapack/ungqr.hpp      |    2 +-
 .../extensions/statistics/CMakeLists.txt      |   15 +-
 .../extensions/statistics/bincount.hpp        |    3 +-
 .../extensions/statistics/histogram.cpp       |    4 +-
 .../extensions/statistics/histogram.hpp       |    4 +-
 .../statistics/histogram_common.cpp           |   10 +-
 .../extensions/statistics/histogramdd.hpp     |    4 +-
 .../statistics/sliding_dot_product1d.cpp      |    8 +-
 .../statistics/sliding_window1d.cpp           |   11 +-
 dpnp/backend/extensions/ufunc/CMakeLists.txt  |   12 +-
 .../elementwise_functions/bitwise_count.cpp   |    6 +-
 .../ufunc/elementwise_functions/degrees.cpp   |    6 +-
 .../ufunc/elementwise_functions/divmod.cpp    |    6 +-
 .../ufunc/elementwise_functions/erf_funcs.cpp |    6 +-
 .../ufunc/elementwise_functions/fabs.cpp      |    6 +-
 .../elementwise_functions/float_power.cpp     |    6 +-
 .../ufunc/elementwise_functions/fmax.cpp      |    6 +-
 .../ufunc/elementwise_functions/fmin.cpp      |    6 +-
 .../ufunc/elementwise_functions/fmod.cpp      |    6 +-
 .../ufunc/elementwise_functions/frexp.cpp     |    6 +-
 .../ufunc/elementwise_functions/gcd.cpp       |    6 +-
 .../ufunc/elementwise_functions/heaviside.cpp |    6 +-
 .../ufunc/elementwise_functions/i0.cpp        |    6 +-
 .../elementwise_functions/interpolate.cpp     |    8 +-
 .../ufunc/elementwise_functions/isclose.cpp   |    8 +-
 .../ufunc/elementwise_functions/lcm.cpp       |    6 +-
 .../ufunc/elementwise_functions/ldexp.cpp     |    6 +-
 .../elementwise_functions/logaddexp2.cpp      |    6 +-
 .../ufunc/elementwise_functions/modf.cpp      |    6 +-
 .../elementwise_functions/nan_to_num.cpp      |    3 +-
 .../ufunc/elementwise_functions/radians.cpp   |    6 +-
 .../ufunc/elementwise_functions/sinc.cpp      |    6 +-
 .../ufunc/elementwise_functions/spacing.cpp   |    6 +-
 dpnp/backend/extensions/vm/CMakeLists.txt     |   11 +-
 dpnp/backend/extensions/vm/abs.cpp            |    5 +-
 dpnp/backend/extensions/vm/acos.cpp           |    5 +-
 dpnp/backend/extensions/vm/acosh.cpp          |    5 +-
 dpnp/backend/extensions/vm/add.cpp            |    5 +-
 dpnp/backend/extensions/vm/arg.cpp            |    5 +-
 dpnp/backend/extensions/vm/asin.cpp           |    5 +-
 dpnp/backend/extensions/vm/asinh.cpp          |    5 +-
 dpnp/backend/extensions/vm/atan.cpp           |    5 +-
 dpnp/backend/extensions/vm/atan2.cpp          |    5 +-
 dpnp/backend/extensions/vm/atanh.cpp          |    5 +-
 dpnp/backend/extensions/vm/cbrt.cpp           |    5 +-
 dpnp/backend/extensions/vm/ceil.cpp           |    5 +-
 dpnp/backend/extensions/vm/common.hpp         |    4 +-
 dpnp/backend/extensions/vm/conj.cpp           |    5 +-
 dpnp/backend/extensions/vm/copysign.cpp       |    5 +-
 dpnp/backend/extensions/vm/cos.cpp            |    5 +-
 dpnp/backend/extensions/vm/cosh.cpp           |    5 +-
 dpnp/backend/extensions/vm/div.cpp            |    5 +-
 dpnp/backend/extensions/vm/erf_funcs.cpp      |    5 +-
 dpnp/backend/extensions/vm/exp.cpp            |    5 +-
 dpnp/backend/extensions/vm/exp2.cpp           |    5 +-
 dpnp/backend/extensions/vm/expm1.cpp          |    5 +-
 dpnp/backend/extensions/vm/floor.cpp          |    5 +-
 dpnp/backend/extensions/vm/fmax.cpp           |    5 +-
 dpnp/backend/extensions/vm/fmin.cpp           |    5 +-
 dpnp/backend/extensions/vm/fmod.cpp           |    5 +-
 dpnp/backend/extensions/vm/hypot.cpp          |    5 +-
 dpnp/backend/extensions/vm/i0.cpp             |    5 +-
 dpnp/backend/extensions/vm/inv.cpp            |    5 +-
 dpnp/backend/extensions/vm/ln.cpp             |    5 +-
 dpnp/backend/extensions/vm/log10.cpp          |    5 +-
 dpnp/backend/extensions/vm/log1p.cpp          |    5 +-
 dpnp/backend/extensions/vm/log2.cpp           |    5 +-
 dpnp/backend/extensions/vm/modf.cpp           |    5 +-
 dpnp/backend/extensions/vm/mul.cpp            |    5 +-
 dpnp/backend/extensions/vm/nextafter.cpp      |    5 +-
 dpnp/backend/extensions/vm/pow.cpp            |    5 +-
 dpnp/backend/extensions/vm/rint.cpp           |    5 +-
 dpnp/backend/extensions/vm/sin.cpp            |    5 +-
 dpnp/backend/extensions/vm/sinh.cpp           |    5 +-
 dpnp/backend/extensions/vm/sqr.cpp            |    5 +-
 dpnp/backend/extensions/vm/sqrt.cpp           |    5 +-
 dpnp/backend/extensions/vm/sub.cpp            |    5 +-
 dpnp/backend/extensions/vm/tan.cpp            |    5 +-
 dpnp/backend/extensions/vm/tanh.cpp           |    5 +-
 dpnp/backend/extensions/vm/trunc.cpp          |    5 +-
 dpnp/backend/extensions/window/CMakeLists.txt |   12 +-
 dpnp/backend/extensions/window/common.hpp     |    4 +-
 dpnp/backend/extensions/window/kaiser.hpp     |    4 +-
 dpnp/backend/include/dpnp4pybind11.hpp        | 1373 +++++++++++++++++
 pyproject.toml                                |   10 +-
 138 files changed, 7880 insertions(+), 191 deletions(-)
 create mode 100644 dpctl/tensor/libtensor/include/kernels/alignment.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
 create mode 100644 dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
 create mode 100644 dpctl/tensor/libtensor/include/utils/math_utils.hpp
 create mode 100644 dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
 create mode 100644 dpctl/tensor/libtensor/include/utils/offset_utils.hpp
 create mode 100644 dpctl/tensor/libtensor/include/utils/output_validation.hpp
 create mode 100644 dpctl/tensor/libtensor/include/utils/strided_iters.hpp
 create mode 100644 dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
 create mode 100644 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
 create mode 100644 dpctl/tensor/libtensor/include/utils/type_dispatch.hpp
 create mode 100644 dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp
 create mode 100644 dpctl/tensor/libtensor/include/utils/type_utils.hpp
 create mode 100644 dpnp/backend/include/dpnp4pybind11.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9d676232f08e..386b17b44294 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,7 +97,6 @@ find_package(Cython REQUIRED)
 
 find_package(Dpctl REQUIRED)
 message(STATUS "Dpctl_INCLUDE_DIR=" ${Dpctl_INCLUDE_DIR})
-message(STATUS "Dpctl_TENSOR_INCLUDE_DIR=" ${Dpctl_TENSOR_INCLUDE_DIR})
 
 option(DPNP_USE_ONEMATH "Build DPNP with oneMath" OFF)
 set(DPNP_TARGET_CUDA
diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/alignment.hpp
new file mode 100644
index 000000000000..a67e9b15306e
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/alignment.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace dpctl::tensor::kernels::alignment_utils
+{
+inline constexpr std::size_t required_alignment = 64UL;
+
+template <std::uintptr_t alignment, typename Ptr>
+bool is_aligned(Ptr p)
+{
+    return !(reinterpret_cast<std::uintptr_t>(p) % alignment);
+}
+
+template <typename KernelName>
+class disabled_sg_loadstore_wrapper_krn;
+} // namespace dpctl::tensor::kernels::alignment_utils
diff --git a/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
new file mode 100644
index 000000000000..4db78e1805e3
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
@@ -0,0 +1,40 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+
+namespace dpctl::tensor
+{
+typedef std::ptrdiff_t ssize_t;
+} // namespace dpctl::tensor
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
new file mode 100644
index 000000000000..d19930b722a9
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -0,0 +1,1045 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines common code for elementwise tensor operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "common_detail.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/sycl_utils.hpp"
+
+#include "kernels/alignment.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
+
+namespace dpctl::tensor::kernels::elementwise_common
+{
+using dpctl::tensor::ssize_t;
+using dpctl::tensor::kernels::alignment_utils::
+    disabled_sg_loadstore_wrapper_krn;
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
+/*! @brief Functor for unary function evaluation on contiguous array */
+template <typename argT,
+          typename resT,
+          typename UnaryOperatorT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+struct UnaryContigFunctor
+{
+private:
+    const argT *in = nullptr;
+    resT *out = nullptr;
+    std::size_t nelems_;
+
+public:
+    UnaryContigFunctor(const argT *inp, resT *res, const std::size_t n_elems)
+        : in(inp), out(res), nelems_(n_elems)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
+        UnaryOperatorT op{};
+        /* Each work-item processes vec_sz elements, contiguous in memory */
+        /* NOTE: work-group size must be divisible by sub-group size */
+
+        if constexpr (enable_sg_loadstore && UnaryOperatorT::is_constant::value)
+        {
+            // value of operator is known to be a known constant
+            constexpr resT const_val = UnaryOperatorT::constant_value;
+
+            auto sg = ndit.get_sub_group();
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            const std::size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+            if (base + elems_per_wi * sgSize < nelems_) {
+                static constexpr sycl::vec<resT, vec_sz> res_vec(const_val);
+#pragma unroll
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const std::size_t offset = base + it * sgSize;
+                    auto out_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&out[offset]);
+
+                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
+                    out[k] = const_val;
+                }
+            }
+        }
+        else if constexpr (enable_sg_loadstore &&
+                           UnaryOperatorT::supports_sg_loadstore::value &&
+                           UnaryOperatorT::supports_vec::value && (vec_sz > 1))
+        {
+            auto sg = ndit.get_sub_group();
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            const std::size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+            if (base + elems_per_wi * sgSize < nelems_) {
+#pragma unroll
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const std::size_t offset = base + it * sgSize;
+                    auto in_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&in[offset]);
+                    auto out_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&out[offset]);
+
+                    const sycl::vec<argT, vec_sz> x =
+                        sub_group_load<vec_sz>(sg, in_multi_ptr);
+                    const sycl::vec<resT, vec_sz> res_vec = op(x);
+                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
+                    // scalar call
+                    out[k] = op(in[k]);
+                }
+            }
+        }
+        else if constexpr (enable_sg_loadstore &&
+                           UnaryOperatorT::supports_sg_loadstore::value &&
+                           std::is_same_v<resT, argT>)
+        {
+            // default: use scalar-value function
+
+            auto sg = ndit.get_sub_group();
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+            const std::size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+
+            if (base + elems_per_wi * sgSize < nelems_) {
+#pragma unroll
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const std::size_t offset = base + it * sgSize;
+                    auto in_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&in[offset]);
+                    auto out_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&out[offset]);
+
+                    sycl::vec<argT, vec_sz> arg_vec =
+                        sub_group_load<vec_sz>(sg, in_multi_ptr);
+#pragma unroll
+                    for (std::uint32_t k = 0; k < vec_sz; ++k) {
+                        arg_vec[k] = op(arg_vec[k]);
+                    }
+                    sub_group_store<vec_sz>(sg, arg_vec, out_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
+                    out[k] = op(in[k]);
+                }
+            }
+        }
+        else if constexpr (enable_sg_loadstore &&
+                           UnaryOperatorT::supports_sg_loadstore::value)
+        {
+            // default: use scalar-value function
+
+            auto sg = ndit.get_sub_group();
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+            const std::size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+
+            if (base + elems_per_wi * sgSize < nelems_) {
+#pragma unroll
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const std::size_t offset = base + it * sgSize;
+                    auto in_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&in[offset]);
+                    auto out_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&out[offset]);
+
+                    const sycl::vec<argT, vec_sz> arg_vec =
+                        sub_group_load<vec_sz>(sg, in_multi_ptr);
+                    sycl::vec<resT, vec_sz> res_vec;
+#pragma unroll
+                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
+                        res_vec[k] = op(arg_vec[k]);
+                    }
+                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
+                    out[k] = op(in[k]);
+                }
+            }
+        }
+        else {
+            const std::uint16_t sgSize =
+                ndit.get_sub_group().get_local_range()[0];
+            const std::size_t gid = ndit.get_global_linear_id();
+            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;
+
+            const std::size_t start =
+                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
+            const std::size_t end = std::min(nelems_, start + elems_per_sg);
+            for (std::size_t offset = start; offset < end; offset += sgSize) {
+                out[offset] = op(in[offset]);
+            }
+        }
+    }
+};
+
+template <typename argT, typename resT, typename IndexerT, typename UnaryOpT>
+struct UnaryStridedFunctor
+{
+private:
+    const argT *inp_ = nullptr;
+    resT *res_ = nullptr;
+    IndexerT inp_out_indexer_;
+
+public:
+    UnaryStridedFunctor(const argT *inp_p,
+                        resT *res_p,
+                        const IndexerT &inp_out_indexer)
+        : inp_(inp_p), res_(res_p), inp_out_indexer_(inp_out_indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wid) const
+    {
+        const auto &offsets_ = inp_out_indexer_(wid.get(0));
+        const ssize_t &inp_offset = offsets_.get_first_offset();
+        const ssize_t &res_offset = offsets_.get_second_offset();
+
+        UnaryOpT op{};
+
+        res_[res_offset] = op(inp_[inp_offset]);
+    }
+};
+
+template <typename SizeT>
+SizeT select_lws(const sycl::device &, SizeT n_work_items_needed)
+{
+    // TODO: make the decision based on device descriptors
+
+    // constexpr SizeT few_threshold = (SizeT(1) << 17);
+    static constexpr SizeT med_threshold = (SizeT(1) << 21);
+
+    const SizeT lws =
+        (n_work_items_needed <= med_threshold ? SizeT(128) : SizeT(256));
+
+    return lws;
+}
+
+template <typename argTy,
+          template <typename T>
+          class UnaryOutputType,
+          template <typename A,
+                    typename R,
+                    std::uint8_t vs,
+                    std::uint8_t nv,
+                    bool enable>
+          class ContigFunctorT,
+          template <typename A, typename R, std::uint8_t vs, std::uint8_t nv>
+          class kernel_name,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u>
+sycl::event unary_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
+    const std::size_t n_work_items_needed = nelems / elems_per_wi;
+    const std::size_t lws =
+        select_lws(exec_q.get_device(), n_work_items_needed);
+
+    const std::size_t n_groups =
+        ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi));
+    const auto gws_range = sycl::range<1>(n_groups * lws);
+    const auto lws_range = sycl::range<1>(lws);
+
+    using resTy = typename UnaryOutputType<argTy>::value_type;
+    using BaseKernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
+
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_p);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        if (is_aligned<required_alignment>(arg_p) &&
+            is_aligned<required_alignment>(res_p))
+        {
+            static constexpr bool enable_sg_loadstore = true;
+            using KernelName = BaseKernelName;
+            using Impl = ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
+                                        enable_sg_loadstore>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                Impl(arg_tp, res_tp, nelems));
+        }
+        else {
+            static constexpr bool disable_sg_loadstore = false;
+            using KernelName =
+                disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
+            using Impl = ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
+                                        disable_sg_loadstore>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                Impl(arg_tp, res_tp, nelems));
+        }
+    });
+
+    return comp_ev;
+}
+
+template <typename argTy,
+          template <typename T>
+          class UnaryOutputType,
+          template <typename A, typename R, typename I>
+          class StridedFunctorT,
+          template <typename A, typename R, typename I>
+          class kernel_name>
+sycl::event
+    unary_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.depends_on(additional_depends);
+
+        using resTy = typename UnaryOutputType<argTy>::value_type;
+        using IndexerT =
+            typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+        const IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides};
+
+        const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
+        resTy *res_tp = reinterpret_cast<resTy *>(res_p);
+
+        using Impl = StridedFunctorT<argTy, resTy, IndexerT>;
+
+        cgh.parallel_for<kernel_name<argTy, resTy, IndexerT>>(
+            {nelems}, Impl(arg_tp, res_tp, indexer));
+    });
+    return comp_ev;
+}
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          typename BinaryOperatorT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+struct BinaryContigFunctor
+{
+private:
+    const argT1 *in1 = nullptr;
+    const argT2 *in2 = nullptr;
+    resT *out = nullptr;
+    std::size_t nelems_;
+
+public:
+    BinaryContigFunctor(const argT1 *inp1,
+                        const argT2 *inp2,
+                        resT *res,
+                        const std::size_t n_elems)
+        : in1(inp1), in2(inp2), out(res), nelems_(n_elems)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
+        BinaryOperatorT op{};
+        /* Each work-item processes vec_sz elements, contiguous in memory */
+        /* NOTE: work-group size must be divisible by sub-group size */
+
+        if constexpr (enable_sg_loadstore &&
+                      BinaryOperatorT::supports_sg_loadstore::value &&
+                      BinaryOperatorT::supports_vec::value && (vec_sz > 1))
+        {
+            auto sg = ndit.get_sub_group();
+            std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            const std::size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+
+            if (base + elems_per_wi * sgSize < nelems_) {
+                sycl::vec<resT, vec_sz> res_vec;
+
+#pragma unroll
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    std::size_t offset = base + it * sgSize;
+                    auto in1_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&in1[offset]);
+                    auto in2_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&in2[offset]);
+                    auto out_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&out[offset]);
+
+                    const sycl::vec<argT1, vec_sz> arg1_vec =
+                        sub_group_load<vec_sz>(sg, in1_multi_ptr);
+                    const sycl::vec<argT2, vec_sz> arg2_vec =
+                        sub_group_load<vec_sz>(sg, in2_multi_ptr);
+                    res_vec = op(arg1_vec, arg2_vec);
+                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
+                    out[k] = op(in1[k], in2[k]);
+                }
+            }
+        }
+        else if constexpr (enable_sg_loadstore &&
+                           BinaryOperatorT::supports_sg_loadstore::value)
+        {
+            auto sg = ndit.get_sub_group();
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            const std::size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+
+            if (base + elems_per_wi * sgSize < nelems_) {
+#pragma unroll
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const std::size_t offset = base + it * sgSize;
+                    auto in1_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&in1[offset]);
+                    auto in2_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&in2[offset]);
+                    auto out_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&out[offset]);
+
+                    const sycl::vec<argT1, vec_sz> arg1_vec =
+                        sub_group_load<vec_sz>(sg, in1_multi_ptr);
+                    const sycl::vec<argT2, vec_sz> arg2_vec =
+                        sub_group_load<vec_sz>(sg, in2_multi_ptr);
+
+                    sycl::vec<resT, vec_sz> res_vec;
+#pragma unroll
+                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
+                        res_vec[vec_id] =
+                            op(arg1_vec[vec_id], arg2_vec[vec_id]);
+                    }
+                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
+                    out[k] = op(in1[k], in2[k]);
+                }
+            }
+        }
+        else {
+            const std::size_t sgSize =
+                ndit.get_sub_group().get_local_range()[0];
+            const std::size_t gid = ndit.get_global_linear_id();
+            const std::size_t elems_per_sg = sgSize * elems_per_wi;
+
+            const std::size_t start =
+                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
+            const std::size_t end = std::min(nelems_, start + elems_per_sg);
+            for (std::size_t offset = start; offset < end; offset += sgSize) {
+                out[offset] = op(in1[offset], in2[offset]);
+            }
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          typename ThreeOffsets_IndexerT,
+          typename BinaryOperatorT>
+struct BinaryStridedFunctor
+{
+private:
+    const argT1 *in1 = nullptr;
+    const argT2 *in2 = nullptr;
+    resT *out = nullptr;
+    ThreeOffsets_IndexerT three_offsets_indexer_;
+
+public:
+    BinaryStridedFunctor(const argT1 *inp1_tp,
+                         const argT2 *inp2_tp,
+                         resT *res_tp,
+                         const ThreeOffsets_IndexerT &inps_res_indexer)
+        : in1(inp1_tp), in2(inp2_tp), out(res_tp),
+          three_offsets_indexer_(inps_res_indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wid) const
+    {
+        const auto &three_offsets_ =
+            three_offsets_indexer_(static_cast<ssize_t>(wid.get(0)));
+
+        const auto &inp1_offset = three_offsets_.get_first_offset();
+        const auto &inp2_offset = three_offsets_.get_second_offset();
+        const auto &out_offset = three_offsets_.get_third_offset();
+
+        BinaryOperatorT op{};
+        out[out_offset] = op(in1[inp1_offset], in2[inp2_offset]);
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          typename BinaryOperatorT>
+struct BinaryContigMatrixContigRowBroadcastingFunctor
+{
+private:
+    const argT1 *mat;
+    const argT2 *padded_vec;
+    resT *res;
+    std::size_t n_elems;
+    std::size_t n1;
+
+public:
+    BinaryContigMatrixContigRowBroadcastingFunctor(const argT1 *mat_tp,
+                                                   const argT2 *row_tp,
+                                                   resT *res_tp,
+                                                   std::size_t n_elems_in_mat,
+                                                   std::size_t n_elems_in_row)
+        : mat(mat_tp), padded_vec(row_tp), res(res_tp), n_elems(n_elems_in_mat),
+          n1(n_elems_in_row)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        /* NOTE: work-group size must be divisible by sub-group size */
+
+        BinaryOperatorT op{};
+        static_assert(BinaryOperatorT::supports_sg_loadstore::value);
+
+        const auto &sg = ndit.get_sub_group();
+        const std::size_t gid = ndit.get_global_linear_id();
+
+        const std::size_t sgSize = sg.get_max_local_range()[0];
+        const std::size_t base = gid - sg.get_local_id()[0];
+
+        if (base + sgSize < n_elems) {
+            auto in1_multi_ptr = sycl::address_space_cast<
+                sycl::access::address_space::global_space,
+                sycl::access::decorated::yes>(&mat[base]);
+
+            auto in2_multi_ptr = sycl::address_space_cast<
+                sycl::access::address_space::global_space,
+                sycl::access::decorated::yes>(&padded_vec[base % n1]);
+
+            auto out_multi_ptr = sycl::address_space_cast<
+                sycl::access::address_space::global_space,
+                sycl::access::decorated::yes>(&res[base]);
+
+            const argT1 mat_el = sub_group_load(sg, in1_multi_ptr);
+            const argT2 vec_el = sub_group_load(sg, in2_multi_ptr);
+
+            resT res_el = op(mat_el, vec_el);
+
+            sub_group_store(sg, res_el, out_multi_ptr);
+        }
+        else {
+            const std::size_t lane_id = sg.get_local_id()[0];
+            for (std::size_t k = base + lane_id; k < n_elems; k += sgSize) {
+                res[k] = op(mat[k], padded_vec[k % n1]);
+            }
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          typename BinaryOperatorT>
+struct BinaryContigRowContigMatrixBroadcastingFunctor
+{
+private:
+    const argT1 *padded_vec;
+    const argT2 *mat;
+    resT *res;
+    std::size_t n_elems;
+    std::size_t n1;
+
+public:
+    BinaryContigRowContigMatrixBroadcastingFunctor(const argT1 *row_tp,
+                                                   const argT2 *mat_tp,
+                                                   resT *res_tp,
+                                                   std::size_t n_elems_in_mat,
+                                                   std::size_t n_elems_in_row)
+        : padded_vec(row_tp), mat(mat_tp), res(res_tp), n_elems(n_elems_in_mat),
+          n1(n_elems_in_row)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        /* NOTE: work-group size must be divisible by sub-group size */
+        BinaryOperatorT op{};
+        static_assert(BinaryOperatorT::supports_sg_loadstore::value);
+
+        const auto &sg = ndit.get_sub_group();
+        std::size_t gid = ndit.get_global_linear_id();
+
+        const std::size_t sgSize = sg.get_max_local_range()[0];
+        const std::size_t base = gid - sg.get_local_id()[0];
+
+        if (base + sgSize < n_elems) {
+            auto in1_multi_ptr = sycl::address_space_cast<
+                sycl::access::address_space::global_space,
+                sycl::access::decorated::yes>(&padded_vec[base % n1]);
+
+            auto in2_multi_ptr = sycl::address_space_cast<
+                sycl::access::address_space::global_space,
+                sycl::access::decorated::yes>(&mat[base]);
+
+            auto out_multi_ptr = sycl::address_space_cast<
+                sycl::access::address_space::global_space,
+                sycl::access::decorated::yes>(&res[base]);
+
+            const argT2 mat_el = sub_group_load(sg, in2_multi_ptr);
+            const argT1 vec_el = sub_group_load(sg, in1_multi_ptr);
+
+            resT res_el = op(vec_el, mat_el);
+
+            sub_group_store(sg, res_el, out_multi_ptr);
+        }
+        else {
+            const std::size_t lane_id = sg.get_local_id()[0];
+            for (std::size_t k = base + lane_id; k < n_elems; k += sgSize) {
+                res[k] = op(padded_vec[k % n1], mat[k]);
+            }
+        }
+    }
+};
+
+// Typedefs for function pointers
+
+typedef sycl::event (*unary_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+typedef sycl::event (*unary_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const ssize_t *,
+    const char *,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+typedef sycl::event (*binary_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    ssize_t,
+    const char *,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+typedef sycl::event (*binary_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const ssize_t *,
+    const char *,
+    ssize_t,
+    const char *,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+typedef sycl::event (*binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::vector<sycl::event> &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    ssize_t,
+    const char *,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+typedef sycl::event (*binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::vector<sycl::event> &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    ssize_t,
+    const char *,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename argTy1,
+          typename argTy2,
+          template <typename T1, typename T2>
+          class BinaryOutputType,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    std::uint8_t vs,
+                    std::uint8_t nv,
+                    bool enable_sg_loadstore>
+          class BinaryContigFunctorT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    std::uint8_t vs,
+                    std::uint8_t nv>
+          class kernel_name,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u>
+sycl::event binary_contig_impl(sycl::queue &exec_q,
+                               std::size_t nelems,
+                               const char *arg1_p,
+                               ssize_t arg1_offset,
+                               const char *arg2_p,
+                               ssize_t arg2_offset,
+                               char *res_p,
+                               ssize_t res_offset,
+                               const std::vector<sycl::event> &depends = {})
+{
+    const std::size_t n_work_items_needed = nelems / (n_vecs * vec_sz);
+    const std::size_t lws =
+        select_lws(exec_q.get_device(), n_work_items_needed);
+
+    const std::size_t n_groups =
+        ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+    const auto gws_range = sycl::range<1>(n_groups * lws);
+    const auto lws_range = sycl::range<1>(lws);
+
+    using resTy = typename BinaryOutputType<argTy1, argTy2>::value_type;
+    using BaseKernelName = kernel_name<argTy1, argTy2, resTy, vec_sz, n_vecs>;
+
+    const argTy1 *arg1_tp =
+        reinterpret_cast<const argTy1 *>(arg1_p) + arg1_offset;
+    const argTy2 *arg2_tp =
+        reinterpret_cast<const argTy2 *>(arg2_p) + arg2_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_p) + res_offset;
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        if (is_aligned<required_alignment>(arg1_tp) &&
+            is_aligned<required_alignment>(arg2_tp) &&
+            is_aligned<required_alignment>(res_tp))
+        {
+            static constexpr bool enable_sg_loadstore = true;
+            using KernelName = BaseKernelName;
+            using Impl = BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz,
+                                              n_vecs, enable_sg_loadstore>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                Impl(arg1_tp, arg2_tp, res_tp, nelems));
+        }
+        else {
+            static constexpr bool disable_sg_loadstore = false;
+            using KernelName =
+                disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
+            using Impl = BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz,
+                                              n_vecs, disable_sg_loadstore>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                Impl(arg1_tp, arg2_tp, res_tp, nelems));
+        }
+    });
+    return comp_ev;
+}
+
+template <typename argTy1,
+          typename argTy2,
+          template <typename T1, typename T2>
+          class BinaryOutputType,
+          template <typename T1, typename T2, typename T3, typename IndT>
+          class BinaryStridedFunctorT,
+          template <typename T1, typename T2, typename T3, typename IndT>
+          class kernel_name>
+sycl::event
+    binary_strided_impl(sycl::queue &exec_q,
+                        std::size_t nelems,
+                        int nd,
+                        const ssize_t *shape_and_strides,
+                        const char *arg1_p,
+                        ssize_t arg1_offset,
+                        const char *arg2_p,
+                        ssize_t arg2_offset,
+                        char *res_p,
+                        ssize_t res_offset,
+                        const std::vector<sycl::event> &depends,
+                        const std::vector<sycl::event> &additional_depends)
+{
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.depends_on(additional_depends);
+
+        using resTy = typename BinaryOutputType<argTy1, argTy2>::value_type;
+
+        using IndexerT =
+            typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
+
+        const IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset,
+                               shape_and_strides};
+
+        const argTy1 *arg1_tp = reinterpret_cast<const argTy1 *>(arg1_p);
+        const argTy2 *arg2_tp = reinterpret_cast<const argTy2 *>(arg2_p);
+        resTy *res_tp = reinterpret_cast<resTy *>(res_p);
+
+        using Impl = BinaryStridedFunctorT<argTy1, argTy2, resTy, IndexerT>;
+
+        cgh.parallel_for<kernel_name<argTy1, argTy2, resTy, IndexerT>>(
+            {nelems}, Impl(arg1_tp, arg2_tp, res_tp, indexer));
+    });
+    return comp_ev;
+}
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          template <typename T1, typename T2, typename T3>
+          class BinaryContigMatrixContigRowBroadcastFunctorT,
+          template <typename T1, typename T2, typename T3>
+          class kernel_name>
+sycl::event binary_contig_matrix_contig_row_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = op(mat[i,j], vec[j])
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    const argT1 *mat = reinterpret_cast<const argT1 *>(mat_p) + mat_offset;
+    const argT2 *vec = reinterpret_cast<const argT2 *>(vec_p) + vec_offset;
+    resT *res = reinterpret_cast<resT *>(res_p) + res_offset;
+
+    const auto &dev = exec_q.get_device();
+    const auto &sg_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
+    // Get device-specific kernel info max_sub_group_size
+    std::size_t max_sgSize =
+        *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
+
+    std::size_t n1_padded = n1 + max_sgSize;
+    auto padded_vec_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<argT2>(n1_padded,
+                                                               exec_q);
+    argT2 *padded_vec = padded_vec_owner.get();
+
+    sycl::event make_padded_vec_ev =
+        dpctl::tensor::kernels::elementwise_detail::populate_padded_vector<
+            argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends);
+
+    // sub-group spans work-items [I, I + sgSize)
+    // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
+    // Generically, sub_group_load( &mat[base]) may load arrays from
+    // different rows of mat. The start corresponds to row (base / n0)
+    // We read sub_group_load(&padded_vec[(base / n0)]).
+    // The vector is padded to ensure that reads are accessible
+
+    const std::size_t lws = 128;
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(make_padded_vec_ev);
+
+        auto lwsRange = sycl::range<1>(lws);
+        std::size_t n_elems = n0 * n1;
+        std::size_t n_groups = (n_elems + lws - 1) / lws;
+        auto gwsRange = sycl::range<1>(n_groups * lws);
+
+        using Impl =
+            BinaryContigMatrixContigRowBroadcastFunctorT<argT1, argT2, resT>;
+
+        cgh.parallel_for<class kernel_name<argT1, argT2, resT>>(
+            sycl::nd_range<1>(gwsRange, lwsRange),
+            Impl(mat, padded_vec, res, n_elems, n1));
+    });
+
+    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {comp_ev}, padded_vec_owner);
+
+    host_tasks.push_back(tmp_cleanup_ev);
+
+    return comp_ev;
+}
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          template <typename T1, typename T2, typename T3>
+          class BinaryContigRowContigMatrixBroadcastFunctorT,
+          template <typename T1, typename T2, typename T3>
+          class kernel_name>
+sycl::event binary_contig_row_contig_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = op(vec[j], mat[i,j])
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    const argT1 *vec = reinterpret_cast<const argT2 *>(vec_p) + vec_offset;
+    const argT2 *mat = reinterpret_cast<const argT1 *>(mat_p) + mat_offset;
+    resT *res = reinterpret_cast<resT *>(res_p) + res_offset;
+
+    const auto &dev = exec_q.get_device();
+    const auto &sg_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
+    // Get device-specific kernel info max_sub_group_size
+    std::size_t max_sgSize =
+        *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
+
+    std::size_t n1_padded = n1 + max_sgSize;
+    auto padded_vec_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<argT2>(n1_padded,
+                                                               exec_q);
+    argT2 *padded_vec = padded_vec_owner.get();
+
+    sycl::event make_padded_vec_ev =
+        dpctl::tensor::kernels::elementwise_detail::populate_padded_vector<
+            argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends);
+
+    // sub-group spans work-items [I, I + sgSize)
+    // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
+    // Generically, sub_group_load( &mat[base]) may load arrays from
+    // different rows of mat. The start corresponds to row (base / n0)
+    // We read sub_group_load(&padded_vec[(base / n0)]). The vector is
+    // padded to ensure that reads are accessible
+
+    const std::size_t lws = 128;
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(make_padded_vec_ev);
+
+        auto lwsRange = sycl::range<1>(lws);
+        std::size_t n_elems = n0 * n1;
+        std::size_t n_groups = (n_elems + lws - 1) / lws;
+        auto gwsRange = sycl::range<1>(n_groups * lws);
+
+        using Impl =
+            BinaryContigRowContigMatrixBroadcastFunctorT<argT1, argT2, resT>;
+
+        cgh.parallel_for<class kernel_name<argT1, argT2, resT>>(
+            sycl::nd_range<1>(gwsRange, lwsRange),
+            Impl(padded_vec, mat, res, n_elems, n1));
+    });
+
+    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {comp_ev}, padded_vec_owner);
+
+    host_tasks.push_back(tmp_cleanup_ev);
+
+    return comp_ev;
+};
+} // namespace dpctl::tensor::kernels::elementwise_common
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
new file mode 100644
index 000000000000..b304b5ac3a39
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
@@ -0,0 +1,70 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines common code for elementwise tensor operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::kernels::elementwise_detail
+{
+template <typename T>
+class populate_padded_vec_krn;
+
+template <typename T>
+sycl::event
+    populate_padded_vector(sycl::queue &exec_q,
+                           const T *vec,
+                           std::size_t vec_sz,
+                           T *padded_vec,
+                           size_t padded_vec_sz,
+                           const std::vector<sycl::event> &dependent_events)
+{
+    sycl::event populate_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
+        // ensure vec contains actual data
+        cgh.depends_on(dependent_events);
+
+        sycl::range<1> gRange{padded_vec_sz};
+
+        cgh.parallel_for<class populate_padded_vec_krn<T>>(
+            gRange, [=](sycl::id<1> id)
+        {
+            std::size_t i = id[0];
+            padded_vec[i] = vec[i % vec_sz];
+            });
+    });
+
+    return populate_padded_vec_ev;
+}
+} // namespace dpctl::tensor::kernels::elementwise_detail
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
new file mode 100644
index 000000000000..8565df2cf528
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -0,0 +1,268 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of LOGADDEXP(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "common.hpp"
+#include "vec_size_util.hpp"
+
+#include "utils/math_utils.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+
+namespace dpctl::tensor::kernels::logaddexp
+{
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+using dpctl::tensor::type_utils::is_complex;
+using dpctl::tensor::type_utils::vec_cast;
+
+template <typename argT1, typename argT2, typename resT>
+struct LogAddExpFunctor
+{
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        using dpctl::tensor::math_utils::logaddexp;
+        return logaddexp<resT>(in1, in2);
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        sycl::vec<resT, vec_sz> res;
+        auto diff = in1 - in2; // take advantange of faster vec arithmetic
+
+#pragma unroll
+        for (int i = 0; i < vec_sz; ++i) {
+            if (std::isfinite(diff[i])) {
+                res[i] = std::max<resT>(in1[i], in2[i]) +
+                         impl_finite<resT>(-sycl::fabs(diff[i]));
+            }
+            else {
+                using dpctl::tensor::math_utils::logaddexp;
+                res[i] = logaddexp<resT>(in1[i], in2[i]);
+            }
+        }
+
+        return res;
+    }
+
+private:
+    template <typename T>
+    T impl_finite(T const &in) const
+    {
+        return (in > 0) ? (in + sycl::log1p(sycl::exp(-in)))
+                        : sycl::log1p(sycl::exp(in));
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using LogAddExpContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    LogAddExpFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using LogAddExpStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    LogAddExpFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct LogAddExpOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct LogAddExpContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class logaddexp_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event logaddexp_contig_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  const char *arg1_p,
+                                  ssize_t arg1_offset,
+                                  const char *arg2_p,
+                                  ssize_t arg2_offset,
+                                  char *res_p,
+                                  ssize_t res_offset,
+                                  const std::vector<sycl::event> &depends = {})
+{
+    using LogAddExpHS =
+        hyperparam_detail::LogAddExpContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = LogAddExpHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LogAddExpHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, LogAddExpOutputType, LogAddExpContigFunctor,
+        logaddexp_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogAddExpContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogAddExpOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logaddexp_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct LogAddExpTypeMapFactory
+{
+    /*! @brief get typeid for output type of logaddexp(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename LogAddExpOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class logaddexp_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    logaddexp_strided_impl(sycl::queue &exec_q,
+                           std::size_t nelems,
+                           int nd,
+                           const ssize_t *shape_and_strides,
+                           const char *arg1_p,
+                           ssize_t arg1_offset,
+                           const char *arg2_p,
+                           ssize_t arg2_offset,
+                           char *res_p,
+                           ssize_t res_offset,
+                           const std::vector<sycl::event> &depends,
+                           const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, LogAddExpOutputType, LogAddExpStridedFunctor,
+        logaddexp_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                  arg1_offset, arg2_p, arg2_offset, res_p,
+                                  res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogAddExpStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogAddExpOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logaddexp_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+class logaddexp_matrix_row_broadcast_sg_krn;
+
+} // namespace dpctl::tensor::kernels::logaddexp
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
new file mode 100644
index 000000000000..067ccd84f059
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -0,0 +1,322 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of MAXIMUM(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "common.hpp"
+#include "vec_size_util.hpp"
+
+#include "utils/math_utils.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+
+namespace dpctl::tensor::kernels::maximum
+{
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct MaximumFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (tu_ns::is_complex<argT1>::value ||
+                      tu_ns::is_complex<argT2>::value)
+        {
+            static_assert(std::is_same_v<argT1, argT2>);
+            using dpctl::tensor::math_utils::max_complex;
+            return max_complex<argT1>(in1, in2);
+        }
+        else if constexpr (std::is_floating_point_v<argT1> ||
+                           std::is_same_v<argT1, sycl::half>)
+        {
+            const bool choose_first = (sycl::isnan(in1) || (in1 > in2));
+            return (choose_first) ? in1 : in2;
+        }
+        else {
+            return (in1 > in2) ? in1 : in2;
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        sycl::vec<resT, vec_sz> res;
+#pragma unroll
+        for (int i = 0; i < vec_sz; ++i) {
+            const auto &v1 = in1[i];
+            const auto &v2 = in2[i];
+            if constexpr (std::is_floating_point_v<argT1> ||
+                          std::is_same_v<argT1, sycl::half>)
+            {
+                const bool choose_first = (sycl::isnan(v1) || (v1 > v2));
+                res[i] = (choose_first) ? v1 : v2;
+            }
+            else {
+                res[i] = (v1 > v2) ? v1 : v2;
+            }
+        }
+        return res;
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using MaximumContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            MaximumFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using MaximumStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    MaximumFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct MaximumOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct MaximumContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class maximum_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event maximum_contig_impl(sycl::queue &exec_q,
+                                std::size_t nelems,
+                                const char *arg1_p,
+                                ssize_t arg1_offset,
+                                const char *arg2_p,
+                                ssize_t arg2_offset,
+                                char *res_p,
+                                ssize_t res_offset,
+                                const std::vector<sycl::event> &depends = {})
+{
+    using MaxHS =
+        hyperparam_detail::MaximumContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = MaxHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = MaxHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, MaximumOutputType, MaximumContigFunctor,
+        maximum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                               arg1_offset, arg2_p, arg2_offset,
+                                               res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MaximumContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!MaximumOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = maximum_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct MaximumTypeMapFactory
+{
+    /*! @brief get typeid for output type of maximum(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename MaximumOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class maximum_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    maximum_strided_impl(sycl::queue &exec_q,
+                         std::size_t nelems,
+                         int nd,
+                         const ssize_t *shape_and_strides,
+                         const char *arg1_p,
+                         ssize_t arg1_offset,
+                         const char *arg2_p,
+                         ssize_t arg2_offset,
+                         char *res_p,
+                         ssize_t res_offset,
+                         const std::vector<sycl::event> &depends,
+                         const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, MaximumOutputType, MaximumStridedFunctor,
+        maximum_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                arg1_offset, arg2_p, arg2_offset, res_p,
+                                res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MaximumStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!MaximumOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = maximum_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+} // namespace dpctl::tensor::kernels::maximum
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
new file mode 100644
index 000000000000..a38945f89a25
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -0,0 +1,321 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of MINIMUM(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include <sycl/sycl.hpp>
+
+#include "common.hpp"
+#include "vec_size_util.hpp"
+
+#include "utils/math_utils.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+
+namespace dpctl::tensor::kernels::minimum
+{
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct MinimumFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (tu_ns::is_complex<argT1>::value ||
+                      tu_ns::is_complex<argT2>::value)
+        {
+            static_assert(std::is_same_v<argT1, argT2>);
+            using dpctl::tensor::math_utils::min_complex;
+            return min_complex<argT1>(in1, in2);
+        }
+        else if constexpr (std::is_floating_point_v<argT1> ||
+                           std::is_same_v<argT1, sycl::half>)
+        {
+            const bool choose_first = sycl::isnan(in1) || (in1 < in2);
+            return (choose_first) ? in1 : in2;
+        }
+        else {
+            return (in1 < in2) ? in1 : in2;
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        sycl::vec<resT, vec_sz> res;
+#pragma unroll
+        for (int i = 0; i < vec_sz; ++i) {
+            const auto &v1 = in1[i];
+            const auto &v2 = in2[i];
+            if constexpr (std::is_floating_point_v<argT1> ||
+                          std::is_same_v<argT1, sycl::half>)
+            {
+                const bool choose_first = sycl::isnan(v1) || (v1 < v2);
+                res[i] = (choose_first) ? v1 : v2;
+            }
+            else {
+                res[i] = (v1 < v2) ? v1 : v2;
+            }
+        }
+        return res;
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using MinimumContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            MinimumFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using MinimumStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    MinimumFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct MinimumOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct MinimumContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class minimum_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event minimum_contig_impl(sycl::queue &exec_q,
+                                std::size_t nelems,
+                                const char *arg1_p,
+                                ssize_t arg1_offset,
+                                const char *arg2_p,
+                                ssize_t arg2_offset,
+                                char *res_p,
+                                ssize_t res_offset,
+                                const std::vector<sycl::event> &depends = {})
+{
+    using MinHS =
+        hyperparam_detail::MinimumContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = MinHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = MinHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, MinimumOutputType, MinimumContigFunctor,
+        minimum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                               arg1_offset, arg2_p, arg2_offset,
+                                               res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MinimumContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!MinimumOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = minimum_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct MinimumTypeMapFactory
+{
+    /*! @brief get typeid for output type of minimum(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename MinimumOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class minimum_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    minimum_strided_impl(sycl::queue &exec_q,
+                         std::size_t nelems,
+                         int nd,
+                         const ssize_t *shape_and_strides,
+                         const char *arg1_p,
+                         ssize_t arg1_offset,
+                         const char *arg2_p,
+                         ssize_t arg2_offset,
+                         char *res_p,
+                         ssize_t res_offset,
+                         const std::vector<sycl::event> &depends,
+                         const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, MinimumOutputType, MinimumStridedFunctor,
+        minimum_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                arg1_offset, arg2_p, arg2_offset, res_p,
+                                res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MinimumStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!MinimumOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = minimum_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+} // namespace dpctl::tensor::kernels::minimum
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
new file mode 100644
index 000000000000..5cadec6ce2a4
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
@@ -0,0 +1,44 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines a macro for defining the SYCL_EXT_ONEAPI_COMPLEX macro
+/// and indirect inclusion of the experimental oneAPI SYCL complex extension
+/// header file.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#define SYCL_EXT_ONEAPI_COMPLEX
+#if __has_include(<sycl/ext/oneapi/experimental/sycl_complex.hpp>)
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#else
+#include <sycl/ext/oneapi/experimental/complex/complex.hpp>
+#endif
+
+namespace exprm_ns = sycl::ext::oneapi::experimental;
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
new file mode 100644
index 000000000000..bdbc7e50cc86
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
@@ -0,0 +1,70 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines utilities for selection of hyperparameters for kernels
+/// implementing unary and binary elementwise functions for contiguous inputs
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+namespace dpctl::tensor::kernels::vec_size_utils
+{
+template <typename Ty1,
+          typename ArgTy1,
+          typename Ty2,
+          typename ArgTy2,
+          std::uint8_t vec_sz_v,
+          std::uint8_t n_vecs_v>
+struct BinaryContigHyperparameterSetEntry
+    : std::conjunction<std::is_same<Ty1, ArgTy1>, std::is_same<Ty2, ArgTy2>>
+{
+    static constexpr std::uint8_t vec_sz = vec_sz_v;
+    static constexpr std::uint8_t n_vecs = n_vecs_v;
+};
+
+template <typename Ty,
+          typename ArgTy,
+          std::uint8_t vec_sz_v,
+          std::uint8_t n_vecs_v>
+struct UnaryContigHyperparameterSetEntry : std::is_same<Ty, ArgTy>
+{
+    static constexpr std::uint8_t vec_sz = vec_sz_v;
+    static constexpr std::uint8_t n_vecs = n_vecs_v;
+};
+
+template <std::uint8_t vec_sz_v, std::uint8_t n_vecs_v>
+struct ContigHyperparameterSetDefault : std::true_type
+{
+    static constexpr std::uint8_t vec_sz = vec_sz_v;
+    static constexpr std::uint8_t n_vecs = n_vecs_v;
+};
+} // namespace dpctl::tensor::kernels::vec_size_utils
diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
new file mode 100644
index 000000000000..d28c8174c39c
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
@@ -0,0 +1,153 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines utilities for handling out-of-bounds integer indices in
+/// kernels that involve indexing operations, such as take, put, or advanced
+/// tensor integer indexing.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+
+namespace dpctl::tensor::indexing_utils
+{
+using dpctl::tensor::ssize_t;
+
+/*
+ * ssize_t for indices is a design choice, dpctl::tensor::usm_ndarray
+ * uses py::ssize_t for shapes and strides internally and Python uses
+ * py_ssize_t for sizes of e.g. lists.
+ */
+
+template <typename IndT>
+struct WrapIndex
+{
+    static_assert(std::is_integral_v<IndT>);
+
+    ssize_t operator()(ssize_t max_item, IndT ind) const
+    {
+        ssize_t projected;
+        static constexpr ssize_t unit(1);
+        max_item = sycl::max(max_item, unit);
+
+        static constexpr std::uintmax_t ind_max =
+            std::numeric_limits<IndT>::max();
+        static constexpr std::uintmax_t ssize_max =
+            std::numeric_limits<ssize_t>::max();
+
+        if constexpr (std::is_signed_v<IndT>) {
+            static constexpr std::intmax_t ind_min =
+                std::numeric_limits<IndT>::min();
+            static constexpr std::intmax_t ssize_min =
+                std::numeric_limits<ssize_t>::min();
+
+            if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) {
+                const ssize_t ind_ = static_cast<ssize_t>(ind);
+                const ssize_t lb = -max_item;
+                const ssize_t ub = max_item - 1;
+                projected = sycl::clamp(ind_, lb, ub);
+            }
+            else {
+                const IndT lb = static_cast<IndT>(-max_item);
+                const IndT ub = static_cast<IndT>(max_item - 1);
+                projected = static_cast<ssize_t>(sycl::clamp(ind, lb, ub));
+            }
+            return (projected < 0) ? projected + max_item : projected;
+        }
+        else {
+            if constexpr (ind_max <= ssize_max) {
+                const ssize_t ind_ = static_cast<ssize_t>(ind);
+                const ssize_t ub = max_item - 1;
+                projected = sycl::min(ind_, ub);
+            }
+            else {
+                const IndT ub = static_cast<IndT>(max_item - 1);
+                projected = static_cast<ssize_t>(sycl::min(ind, ub));
+            }
+            return projected;
+        }
+    }
+};
+
+template <typename IndT>
+struct ClipIndex
+{
+    static_assert(std::is_integral_v<IndT>);
+
+    ssize_t operator()(ssize_t max_item, IndT ind) const
+    {
+        ssize_t projected;
+        static constexpr ssize_t unit(1);
+        max_item = sycl::max<ssize_t>(max_item, unit);
+
+        static constexpr std::uintmax_t ind_max =
+            std::numeric_limits<IndT>::max();
+        static constexpr std::uintmax_t ssize_max =
+            std::numeric_limits<ssize_t>::max();
+        if constexpr (std::is_signed_v<IndT>) {
+            static constexpr std::intmax_t ind_min =
+                std::numeric_limits<IndT>::min();
+            static constexpr std::intmax_t ssize_min =
+                std::numeric_limits<ssize_t>::min();
+
+            if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) {
+                const ssize_t ind_ = static_cast<ssize_t>(ind);
+                static constexpr ssize_t lb(0);
+                const ssize_t ub = max_item - 1;
+                projected = sycl::clamp(ind_, lb, ub);
+            }
+            else {
+                static constexpr IndT lb(0);
+                const IndT ub = static_cast<IndT>(max_item - 1);
+                projected = static_cast<std::size_t>(sycl::clamp(ind, lb, ub));
+            }
+        }
+        else {
+            if constexpr (ind_max <= ssize_max) {
+                const ssize_t ind_ = static_cast<ssize_t>(ind);
+                const ssize_t ub = max_item - 1;
+                projected = sycl::min(ind_, ub);
+            }
+            else {
+                const IndT ub = static_cast<IndT>(max_item - 1);
+                projected = static_cast<ssize_t>(sycl::min(ind, ub));
+            }
+        }
+        return projected;
+    }
+};
+} // namespace dpctl::tensor::indexing_utils
diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl/tensor/libtensor/include/utils/math_utils.hpp
new file mode 100644
index 000000000000..d35eff0074dc
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/utils/math_utils.hpp
@@ -0,0 +1,148 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines math utility functions.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cmath>
+#include <complex>
+#include <limits>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::math_utils
+{
+template <typename T>
+bool less_complex(const T &x1, const T &x2)
+{
+    using realT = typename T::value_type;
+    realT real1 = std::real(x1);
+    realT real2 = std::real(x2);
+    realT imag1 = std::imag(x1);
+    realT imag2 = std::imag(x2);
+
+    return (real1 == real2)
+               ? (imag1 < imag2)
+               : (real1 < real2 && !std::isnan(imag1) && !std::isnan(imag2));
+}
+
+template <typename T>
+bool greater_complex(const T &x1, const T &x2)
+{
+    using realT = typename T::value_type;
+    realT real1 = std::real(x1);
+    realT real2 = std::real(x2);
+    realT imag1 = std::imag(x1);
+    realT imag2 = std::imag(x2);
+
+    return (real1 == real2)
+               ? (imag1 > imag2)
+               : (real1 > real2 && !std::isnan(imag1) && !std::isnan(imag2));
+}
+
+template <typename T>
+bool less_equal_complex(const T &x1, const T &x2)
+{
+    using realT = typename T::value_type;
+    realT real1 = std::real(x1);
+    realT real2 = std::real(x2);
+    realT imag1 = std::imag(x1);
+    realT imag2 = std::imag(x2);
+
+    return (real1 == real2)
+               ? (imag1 <= imag2)
+               : (real1 < real2 && !std::isnan(imag1) && !std::isnan(imag2));
+}
+
+template <typename T>
+bool greater_equal_complex(const T &x1, const T &x2)
+{
+    using realT = typename T::value_type;
+    realT real1 = std::real(x1);
+    realT real2 = std::real(x2);
+    realT imag1 = std::imag(x1);
+    realT imag2 = std::imag(x2);
+
+    return (real1 == real2)
+               ? (imag1 >= imag2)
+               : (real1 > real2 && !std::isnan(imag1) && !std::isnan(imag2));
+}
+
+template <typename T>
+T max_complex(const T &x1, const T &x2)
+{
+    using realT = typename T::value_type;
+    realT real1 = std::real(x1);
+    realT real2 = std::real(x2);
+    realT imag1 = std::imag(x1);
+    realT imag2 = std::imag(x2);
+
+    bool isnan_imag1 = std::isnan(imag1);
+    bool gt = (real1 == real2)
+                  ? (imag1 > imag2)
+                  : (real1 > real2 && !isnan_imag1 && !std::isnan(imag2));
+    return (std::isnan(real1) || isnan_imag1 || gt) ? x1 : x2;
+}
+
+template <typename T>
+T min_complex(const T &x1, const T &x2)
+{
+    using realT = typename T::value_type;
+    realT real1 = std::real(x1);
+    realT real2 = std::real(x2);
+    realT imag1 = std::imag(x1);
+    realT imag2 = std::imag(x2);
+
+    bool isnan_imag1 = std::isnan(imag1);
+    bool lt = (real1 == real2)
+                  ? (imag1 < imag2)
+                  : (real1 < real2 && !isnan_imag1 && !std::isnan(imag2));
+    return (std::isnan(real1) || isnan_imag1 || lt) ? x1 : x2;
+}
+
+template <typename T>
+T logaddexp(T x, T y)
+{
+    if (x == y) { // handle signed infinities
+        const T log2 = sycl::log(T(2));
+        return x + log2;
+    }
+    else {
+        const T tmp = x - y;
+        static constexpr T zero(0);
+
+        return (tmp > zero)
+                   ? (x + sycl::log1p(sycl::exp(-tmp)))
+                   : ((tmp <= zero) ? y + sycl::log1p(sycl::exp(tmp))
+                                    : std::numeric_limits<T>::quiet_NaN());
+    }
+}
+} // namespace dpctl::tensor::math_utils
diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
new file mode 100644
index 000000000000..b534e55b3192
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
@@ -0,0 +1,157 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines utility to determine whether two arrays have memory
+/// overlap.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+
+#include <pybind11/pybind11.h>
+
+#include "dpnp4pybind11.hpp"
+
+/* @brief check for overlap of memory regions behind arrays.
+
+Presently assume that array occupies all bytes between smallest and largest
+displaced elements.
+
+TODO: Write proper Frobenius solver to account for holes, e.g.
+   overlap( x_contig[::2], x_contig[1::2]) should give False,
+   while this implementation gives True.
+*/
+namespace dpctl::tensor::overlap
+{
+namespace py = pybind11;
+
+struct MemoryOverlap
+{
+    bool operator()(dpctl::tensor::usm_ndarray ar1,
+                    dpctl::tensor::usm_ndarray ar2) const
+    {
+        const char *ar1_data = ar1.get_data();
+
+        const auto &ar1_offsets = ar1.get_minmax_offsets();
+        py::ssize_t ar1_elem_size =
+            static_cast<py::ssize_t>(ar1.get_elemsize());
+
+        const char *ar2_data = ar2.get_data();
+        const auto &ar2_offsets = ar2.get_minmax_offsets();
+        py::ssize_t ar2_elem_size =
+            static_cast<py::ssize_t>(ar2.get_elemsize());
+
+        /* Memory of array1 extends from  */
+        /*    [ar1_data + ar1_offsets.first * ar1_elem_size, ar1_data +
+         * ar1_offsets.second * ar1_elem_size + ar1_elem_size] */
+        /* Memory of array2 extends from */
+        /*    [ar2_data + ar2_offsets.first * ar2_elem_size, ar2_data +
+         * ar2_offsets.second * ar2_elem_size + ar2_elem_size] */
+
+        /* Intervals [x0, x1] and [y0, y1] do not overlap if (x0 <= x1) && (y0
+         * <= y1)
+         * && (x1 <=y0 || y1 <= x0 ) */
+        /* Given that x0 <= x1 and y0 <= y1 are true by construction, the
+         * condition for overlap us (x1 > y0) && (y1 > x0) */
+
+        /*  Applying:
+            (ar1_data + ar1_offsets.second * ar1_elem_size + ar1_elem_size >
+        ar2_data
+        + ar2_offsets.first * ar2_elem_size) && (ar2_data + ar2_offsets.second *
+        ar2_elem_size + ar2_elem_size > ar1_data + ar1_offsets.first *
+        ar1_elem_size)
+        */
+
+        auto byte_distance = static_cast<py::ssize_t>(ar2_data - ar1_data);
+
+        py::ssize_t x1_minus_y0 =
+            (-byte_distance +
+             (ar1_elem_size + (ar1_offsets.second * ar1_elem_size) -
+              (ar2_offsets.first * ar2_elem_size)));
+
+        py::ssize_t y1_minus_x0 =
+            (byte_distance +
+             (ar2_elem_size + (ar2_offsets.second * ar2_elem_size) -
+              (ar1_offsets.first * ar1_elem_size)));
+
+        bool memory_overlap = (x1_minus_y0 > 0) && (y1_minus_x0 > 0);
+
+        return memory_overlap;
+    }
+};
+
+struct SameLogicalTensors
+{
+    bool operator()(dpctl::tensor::usm_ndarray ar1,
+                    dpctl::tensor::usm_ndarray ar2) const
+    {
+        // Same ndim
+        int nd1 = ar1.get_ndim();
+        if (nd1 != ar2.get_ndim())
+            return false;
+
+        // Same dtype
+        int tn1 = ar1.get_typenum();
+        if (tn1 != ar2.get_typenum())
+            return false;
+
+        // Same pointer
+        const char *ar1_data = ar1.get_data();
+        const char *ar2_data = ar2.get_data();
+
+        if (ar1_data != ar2_data)
+            return false;
+
+        // Same shape and strides
+        const py::ssize_t *ar1_shape = ar1.get_shape_raw();
+        const py::ssize_t *ar2_shape = ar2.get_shape_raw();
+
+        if (!std::equal(ar1_shape, ar1_shape + nd1, ar2_shape))
+            return false;
+
+        // Same shape and strides
+        auto const &ar1_strides = ar1.get_strides_vector();
+        auto const &ar2_strides = ar2.get_strides_vector();
+
+        auto ar1_beg_it = std::begin(ar1_strides);
+        auto ar1_end_it = std::end(ar1_strides);
+
+        auto ar2_beg_it = std::begin(ar2_strides);
+
+        if (!std::equal(ar1_beg_it, ar1_end_it, ar2_beg_it))
+            return false;
+
+        // all checks passed: arrays are logical views
+        // into the same memory
+        return true;
+    }
+};
+} // namespace dpctl::tensor::overlap
diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
new file mode 100644
index 000000000000..19664c3d4e12
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
@@ -0,0 +1,824 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines Indexer callable operator to compute element offset in
+/// an array addressed by gloabl_id.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <memory>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "utils/strided_iters.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+
+namespace dpctl::tensor::offset_utils
+{
+namespace detail
+{
+struct sink_t
+{
+    sink_t(){};
+    template <class T>
+    sink_t(T &&){};
+};
+
+template <class V>
+std::size_t __accumulate_size(std::size_t &s, V &&v)
+{
+    return s += v.size();
+}
+
+template <class V, class U>
+sink_t __appender(V &lhs, U &&rhs)
+{
+    lhs.insert(lhs.end(), rhs.begin(), rhs.end());
+    return {};
+}
+
+template <typename T, typename A, typename... Vs>
+std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&...vs)
+{
+    std::size_t s = lhs.size();
+    {
+        // limited scope ensures array is freed
+        [[maybe_unused]] sink_t tmp[] = {__accumulate_size(s, vs)..., 0};
+    }
+    lhs.reserve(s);
+    {
+        // array of no-data objects ensures ordering of calls to the appender
+        [[maybe_unused]] sink_t tmp[] = {
+            __appender(lhs, std::forward<Vs>(vs))..., 0};
+    }
+
+    return std::move(lhs); // prevent return-value optimization
+}
+} // namespace detail
+
+template <typename indT, typename... Vs>
+std::tuple<std::unique_ptr<indT, dpctl::tensor::alloc_utils::USMDeleter>,
+           std::size_t,
+           sycl::event>
+    device_allocate_and_pack(sycl::queue &q,
+                             std::vector<sycl::event> &host_task_events,
+                             Vs &&...vs)
+{
+
+    using dpctl::tensor::alloc_utils::usm_host_allocator;
+
+    // memory transfer optimization, use USM-host for temporary speeds up
+    // transfer to device, especially on dGPUs
+    using usm_host_allocatorT = usm_host_allocator<indT>;
+    using shT = std::vector<indT, usm_host_allocatorT>;
+
+    usm_host_allocatorT usm_host_alloc(q);
+    shT empty{0, usm_host_alloc};
+    shT packed_shape_strides = detail::concat(std::move(empty), vs...);
+
+    auto packed_shape_strides_owner =
+        std::make_shared<shT>(std::move(packed_shape_strides));
+
+    auto sz = packed_shape_strides_owner->size();
+    auto shape_strides_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<indT>(sz, q);
+    indT *shape_strides = shape_strides_owner.get();
+
+    sycl::event copy_ev =
+        q.copy<indT>(packed_shape_strides_owner->data(), shape_strides, sz);
+
+    sycl::event cleanup_host_task_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(copy_ev);
+        cgh.host_task([packed_shape_strides_owner =
+                           std::move(packed_shape_strides_owner)] {
+            // increment shared pointer ref-count to keep it alive
+            // till copy operation completes;
+        });
+    });
+    host_task_events.push_back(cleanup_host_task_ev);
+
+    return std::make_tuple(std::move(shape_strides_owner), sz, copy_ev);
+}
+
+struct NoOpIndexer
+{
+    constexpr NoOpIndexer() {}
+    constexpr std::size_t operator()(std::size_t gid) const
+    {
+        return gid;
+    }
+};
+
+using dpctl::tensor::ssize_t;
+
+/* @brief Indexer with shape and strides arrays of same size are packed */
+struct StridedIndexer
+{
+    StridedIndexer(int _nd,
+                   ssize_t _offset,
+                   ssize_t const *_packed_shape_strides)
+        : nd(_nd), starting_offset(_offset),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    ssize_t operator()(ssize_t gid) const
+    {
+        return compute_offset(gid);
+    }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        return compute_offset(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_offset;
+    ssize_t const *shape_strides;
+
+    ssize_t compute_offset(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector _ind(nd);
+        ssize_t relative_offset(0);
+        _ind.get_displacement<const ssize_t *, const ssize_t *>(
+            gid,
+            shape_strides,      // shape ptr
+            shape_strides + nd, // strides ptr
+            relative_offset);
+        return starting_offset + relative_offset;
+    }
+};
+
+// ensure that indexer is device copyable
+static_assert(sycl::is_device_copyable_v<StridedIndexer>);
+
+/* @brief Indexer with shape, strides provided separately */
+struct UnpackedStridedIndexer
+{
+    UnpackedStridedIndexer(int _nd,
+                           ssize_t _offset,
+                           ssize_t const *_shape,
+                           ssize_t const *_strides)
+        : nd(_nd), starting_offset(_offset), shape(_shape), strides(_strides)
+    {
+    }
+
+    ssize_t operator()(ssize_t gid) const
+    {
+        return compute_offset(gid);
+    }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        return compute_offset(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_offset;
+    ssize_t const *shape;
+    ssize_t const *strides;
+
+    ssize_t compute_offset(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector _ind(nd);
+        ssize_t relative_offset(0);
+        _ind.get_displacement<const ssize_t *, const ssize_t *>(
+            gid,
+            shape,   // shape ptr
+            strides, // strides ptr
+            relative_offset);
+        return starting_offset + relative_offset;
+    }
+};
+
+// ensure that indexer is device copyable
+static_assert(sycl::is_device_copyable_v<UnpackedStridedIndexer>);
+
+struct Strided1DIndexer
+{
+    Strided1DIndexer(std::size_t _size) : offset{}, size(_size), step(1) {}
+    Strided1DIndexer(ssize_t _size)
+        : offset{}, size(static_cast<std::size_t>(_size)), step(1)
+    {
+    }
+    Strided1DIndexer(std::size_t _size, ssize_t _step)
+        : offset{}, size(_size), step(_step)
+    {
+    }
+    Strided1DIndexer(std::size_t _size, std::size_t _step)
+        : offset{}, size(_size), step(static_cast<ssize_t>(_step))
+    {
+    }
+    Strided1DIndexer(ssize_t _size, ssize_t _step)
+        : offset{}, size(static_cast<std::size_t>(_size)), step(_step)
+    {
+    }
+    Strided1DIndexer(ssize_t _offset, std::size_t _size, ssize_t _step)
+        : offset(_offset), size(_size), step(_step)
+    {
+    }
+    Strided1DIndexer(ssize_t _offset, std::size_t _size, std::size_t _step)
+        : offset(_offset), size(_size), step(static_cast<ssize_t>(_step))
+    {
+    }
+    Strided1DIndexer(ssize_t _offset, ssize_t _size, ssize_t _step)
+        : offset(_offset), size(static_cast<std::size_t>(_size)), step(_step)
+    {
+    }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        // ensure 0 <= gid < size
+        return offset + std::min<std::size_t>(gid, size - 1) * step;
+    }
+
+private:
+    ssize_t offset = 0;
+    std::size_t size = 1;
+    ssize_t step = 1;
+};
+
+static_assert(sycl::is_device_copyable_v<Strided1DIndexer>);
+
+struct Strided1DCyclicIndexer
+{
+    Strided1DCyclicIndexer(ssize_t _offset, ssize_t _size, ssize_t _step)
+        : offset(_offset), size(static_cast<std::size_t>(_size)), step(_step)
+    {
+    }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        return offset + (gid % size) * step;
+    }
+
+private:
+    ssize_t offset = 0;
+    std::size_t size = 1;
+    ssize_t step = 1;
+};
+
+static_assert(sycl::is_device_copyable_v<Strided1DCyclicIndexer>);
+
+template <typename displacementT>
+struct TwoOffsets
+{
+    constexpr TwoOffsets() : first_offset(0), second_offset(0) {}
+    constexpr TwoOffsets(const displacementT &first_offset_,
+                         const displacementT &second_offset_)
+        : first_offset(first_offset_), second_offset(second_offset_)
+    {
+    }
+
+    constexpr displacementT get_first_offset() const
+    {
+        return first_offset;
+    }
+    constexpr displacementT get_second_offset() const
+    {
+        return second_offset;
+    }
+
+private:
+    displacementT first_offset = 0;
+    displacementT second_offset = 0;
+};
+
+struct TwoOffsets_StridedIndexer
+{
+    TwoOffsets_StridedIndexer(int common_nd,
+                              ssize_t first_offset_,
+                              ssize_t second_offset_,
+                              ssize_t const *_packed_shape_strides)
+        : nd(common_nd), starting_first_offset(first_offset_),
+          starting_second_offset(second_offset_),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    TwoOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return compute_offsets(gid);
+    }
+
+    TwoOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        return compute_offsets(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_first_offset;
+    ssize_t starting_second_offset;
+    ssize_t const *shape_strides;
+
+    TwoOffsets<ssize_t> compute_offsets(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector _ind(nd);
+        ssize_t relative_first_offset(0);
+        ssize_t relative_second_offset(0);
+        _ind.get_displacement<const ssize_t *, const ssize_t *>(
+            gid,
+            shape_strides,          // shape ptr
+            shape_strides + nd,     // strides ptr
+            shape_strides + 2 * nd, // strides ptr
+            relative_first_offset, relative_second_offset);
+        return TwoOffsets<ssize_t>(
+            starting_first_offset + relative_first_offset,
+            starting_second_offset + relative_second_offset);
+    }
+};
+
+struct TwoZeroOffsets_Indexer
+{
+    constexpr TwoZeroOffsets_Indexer() {}
+
+    constexpr TwoOffsets<ssize_t> operator()(ssize_t) const
+    {
+        return TwoOffsets<ssize_t>();
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<TwoZeroOffsets_Indexer>);
+
+template <typename FirstIndexerT, typename SecondIndexerT>
+struct TwoOffsets_CombinedIndexer
+{
+private:
+    FirstIndexerT first_indexer_;
+    SecondIndexerT second_indexer_;
+
+public:
+    constexpr TwoOffsets_CombinedIndexer(const FirstIndexerT &first_indexer,
+                                         const SecondIndexerT &second_indexer)
+        : first_indexer_(first_indexer), second_indexer_(second_indexer)
+    {
+    }
+
+    constexpr TwoOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return TwoOffsets<ssize_t>(first_indexer_(gid), second_indexer_(gid));
+    }
+};
+
+template <typename displacementT>
+struct ThreeOffsets
+{
+    constexpr ThreeOffsets()
+        : first_offset(0), second_offset(0), third_offset(0)
+    {
+    }
+    constexpr ThreeOffsets(const displacementT &first_offset_,
+                           const displacementT &second_offset_,
+                           const displacementT &third_offset_)
+        : first_offset(first_offset_), second_offset(second_offset_),
+          third_offset(third_offset_)
+    {
+    }
+
+    constexpr displacementT get_first_offset() const
+    {
+        return first_offset;
+    }
+    constexpr displacementT get_second_offset() const
+    {
+        return second_offset;
+    }
+    constexpr displacementT get_third_offset() const
+    {
+        return third_offset;
+    }
+
+private:
+    displacementT first_offset = 0;
+    displacementT second_offset = 0;
+    displacementT third_offset = 0;
+};
+
+struct ThreeOffsets_StridedIndexer
+{
+    ThreeOffsets_StridedIndexer(int common_nd,
+                                ssize_t first_offset_,
+                                ssize_t second_offset_,
+                                ssize_t third_offset_,
+                                ssize_t const *_packed_shape_strides)
+        : nd(common_nd), starting_first_offset(first_offset_),
+          starting_second_offset(second_offset_),
+          starting_third_offset(third_offset_),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    ThreeOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return compute_offsets(gid);
+    }
+
+    ThreeOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        return compute_offsets(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_first_offset;
+    ssize_t starting_second_offset;
+    ssize_t starting_third_offset;
+    ssize_t const *shape_strides;
+
+    ThreeOffsets<ssize_t> compute_offsets(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector _ind(nd);
+        ssize_t relative_first_offset(0);
+        ssize_t relative_second_offset(0);
+        ssize_t relative_third_offset(0);
+        _ind.get_displacement<const ssize_t *, const ssize_t *>(
+            gid,
+            shape_strides,          // shape ptr
+            shape_strides + nd,     // strides ptr
+            shape_strides + 2 * nd, // strides ptr
+            shape_strides + 3 * nd, // strides ptr
+            relative_first_offset, relative_second_offset,
+            relative_third_offset);
+        return ThreeOffsets<ssize_t>(
+            starting_first_offset + relative_first_offset,
+            starting_second_offset + relative_second_offset,
+            starting_third_offset + relative_third_offset);
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<ThreeOffsets_StridedIndexer>);
+
+struct ThreeZeroOffsets_Indexer
+{
+    constexpr ThreeZeroOffsets_Indexer() {}
+
+    constexpr ThreeOffsets<ssize_t> operator()(ssize_t) const
+    {
+        return ThreeOffsets<ssize_t>();
+    }
+
+    constexpr ThreeOffsets<ssize_t> operator()(std::size_t) const
+    {
+        return ThreeOffsets<ssize_t>();
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<ThreeZeroOffsets_Indexer>);
+
+template <typename FirstIndexerT,
+          typename SecondIndexerT,
+          typename ThirdIndexerT>
+struct ThreeOffsets_CombinedIndexer
+{
+private:
+    FirstIndexerT first_indexer_;
+    SecondIndexerT second_indexer_;
+    ThirdIndexerT third_indexer_;
+
+public:
+    constexpr ThreeOffsets_CombinedIndexer(const FirstIndexerT &first_indexer,
+                                           const SecondIndexerT &second_indexer,
+                                           const ThirdIndexerT &third_indexer)
+        : first_indexer_(first_indexer), second_indexer_(second_indexer),
+          third_indexer_(third_indexer)
+    {
+    }
+
+    constexpr ThreeOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return ThreeOffsets<ssize_t>(first_indexer_(gid), second_indexer_(gid),
+                                     third_indexer_(gid));
+    }
+};
+
+template <typename displacementT>
+struct FourOffsets
+{
+    constexpr FourOffsets()
+        : first_offset(0), second_offset(0), third_offset(0), fourth_offset(0)
+    {
+    }
+    constexpr FourOffsets(const displacementT &first_offset_,
+                          const displacementT &second_offset_,
+                          const displacementT &third_offset_,
+                          const displacementT &fourth_offset_)
+        : first_offset(first_offset_), second_offset(second_offset_),
+          third_offset(third_offset_), fourth_offset(fourth_offset_)
+    {
+    }
+
+    constexpr displacementT get_first_offset() const
+    {
+        return first_offset;
+    }
+    constexpr displacementT get_second_offset() const
+    {
+        return second_offset;
+    }
+    constexpr displacementT get_third_offset() const
+    {
+        return third_offset;
+    }
+    constexpr displacementT get_fourth_offset() const
+    {
+        return fourth_offset;
+    }
+
+private:
+    displacementT first_offset = 0;
+    displacementT second_offset = 0;
+    displacementT third_offset = 0;
+    displacementT fourth_offset = 0;
+};
+
+struct FourOffsets_StridedIndexer
+{
+    constexpr FourOffsets_StridedIndexer(int common_nd,
+                                         ssize_t first_offset_,
+                                         ssize_t second_offset_,
+                                         ssize_t third_offset_,
+                                         ssize_t fourth_offset_,
+                                         ssize_t const *_packed_shape_strides)
+        : nd(common_nd), starting_first_offset(first_offset_),
+          starting_second_offset(second_offset_),
+          starting_third_offset(third_offset_),
+          starting_fourth_offset(fourth_offset_),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    constexpr FourOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return compute_offsets(gid);
+    }
+
+    constexpr FourOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        return compute_offsets(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_first_offset;
+    ssize_t starting_second_offset;
+    ssize_t starting_third_offset;
+    ssize_t starting_fourth_offset;
+    ssize_t const *shape_strides;
+
+    FourOffsets<ssize_t> compute_offsets(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector _ind(nd);
+        ssize_t relative_first_offset(0);
+        ssize_t relative_second_offset(0);
+        ssize_t relative_third_offset(0);
+        ssize_t relative_fourth_offset(0);
+        _ind.get_displacement<const ssize_t *, const ssize_t *>(
+            gid,
+            shape_strides,          // shape ptr
+            shape_strides + nd,     // strides ptr
+            shape_strides + 2 * nd, // strides ptr
+            shape_strides + 3 * nd, // strides ptr
+            shape_strides + 4 * nd, // strides ptr
+            relative_first_offset, relative_second_offset,
+            relative_third_offset, relative_fourth_offset);
+        return FourOffsets<ssize_t>(
+            starting_first_offset + relative_first_offset,
+            starting_second_offset + relative_second_offset,
+            starting_third_offset + relative_third_offset,
+            starting_fourth_offset + relative_fourth_offset);
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<FourOffsets_StridedIndexer>);
+
+struct FourZeroOffsets_Indexer
+{
+    constexpr FourZeroOffsets_Indexer() {}
+
+    constexpr FourOffsets<ssize_t> operator()(ssize_t) const
+    {
+        return FourOffsets<ssize_t>();
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<FourZeroOffsets_Indexer>);
+
+struct NthStrideOffset
+{
+    NthStrideOffset(int common_nd,
+                    ssize_t const *_offsets,
+                    ssize_t const *_packed_shape_strides)
+        : _ind(common_nd), nd(common_nd), offsets(_offsets),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    std::size_t operator()(ssize_t gid, int n) const
+    {
+        ssize_t relative_offset(0);
+        _ind.get_displacement<const ssize_t *, const ssize_t *>(
+            gid, shape_strides, shape_strides + ((n + 1) * nd),
+            relative_offset);
+
+        return relative_offset + offsets[n];
+    }
+
+private:
+    dpctl::tensor::strides::CIndexer_vector<ssize_t> _ind;
+
+    int nd;
+    ssize_t const *offsets;
+    ssize_t const *shape_strides;
+};
+
+static_assert(sycl::is_device_copyable_v<NthStrideOffset>);
+
+template <int nd>
+struct FixedDimStridedIndexer
+{
+    FixedDimStridedIndexer(const std::array<ssize_t, nd> &_shape,
+                           const std::array<ssize_t, nd> &_strides,
+                           ssize_t _offset)
+        : _ind(_shape), strides(_strides), starting_offset(_offset)
+    {
+    }
+    std::size_t operator()(std::size_t gid) const
+    {
+        dpctl::tensor::strides::CIndexer_array<nd, ssize_t> local_indexer(
+            std::move(_ind));
+        local_indexer.set(gid);
+        auto mi = local_indexer.get();
+
+        ssize_t relative_offset = 0;
+
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset += mi[i] * strides[i];
+        }
+        return starting_offset + relative_offset;
+    }
+
+private:
+    dpctl::tensor::strides::CIndexer_array<nd, ssize_t> _ind;
+
+    std::array<ssize_t, nd> strides;
+    ssize_t starting_offset;
+};
+
+static_assert(sycl::is_device_copyable_v<FixedDimStridedIndexer<1>>);
+
+template <int nd>
+struct TwoOffsets_FixedDimStridedIndexer
+{
+    TwoOffsets_FixedDimStridedIndexer(const std::array<ssize_t, nd> &_shape,
+                                      const std::array<ssize_t, nd> &_strides1,
+                                      const std::array<ssize_t, nd> &_strides2,
+                                      ssize_t _offset1,
+                                      ssize_t _offset2)
+        : _ind(_shape), strides1(_strides1), strides2(_strides2),
+          starting_offset1(_offset1), starting_offset2(_offset2)
+    {
+    }
+
+    TwoOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        dpctl::tensor::strides::CIndexer_array<nd, ssize_t> local_indexer(
+            std::move(_ind));
+        local_indexer.set(gid);
+        auto mi = local_indexer.get();
+
+        ssize_t relative_offset1 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset1 += mi[i] * strides1[i];
+        }
+
+        ssize_t relative_offset2 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset2 += mi[i] * strides2[i];
+        }
+
+        return TwoOffsets<ssize_t>(starting_offset1 + relative_offset1,
+                                   starting_offset2 + relative_offset2);
+    }
+
+private:
+    dpctl::tensor::strides::CIndexer_array<nd, ssize_t> _ind;
+
+    std::array<ssize_t, nd> strides1;
+    std::array<ssize_t, nd> strides2;
+    ssize_t starting_offset1;
+    ssize_t starting_offset2;
+};
+
+static_assert(sycl::is_device_copyable_v<TwoOffsets_FixedDimStridedIndexer<1>>);
+
+template <int nd>
+struct ThreeOffsets_FixedDimStridedIndexer
+{
+    ThreeOffsets_FixedDimStridedIndexer(
+        const std::array<ssize_t, nd> &_shape,
+        const std::array<ssize_t, nd> &_strides1,
+        const std::array<ssize_t, nd> &_strides2,
+        const std::array<ssize_t, nd> &_strides3,
+        ssize_t _offset1,
+        ssize_t _offset2,
+        ssize_t _offset3)
+        : _ind(_shape), strides1(_strides1), strides2(_strides2),
+          strides3(_strides3), starting_offset1(_offset1),
+          starting_offset2(_offset2), starting_offset3(_offset3)
+    {
+    }
+
+    ThreeOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        dpctl::tensor::strides::CIndexer_array<nd, ssize_t> local_indexer(
+            std::move(_ind));
+        local_indexer.set(gid);
+        auto mi = local_indexer.get();
+
+        ssize_t relative_offset1 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset1 += mi[i] * strides1[i];
+        }
+
+        ssize_t relative_offset2 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset2 += mi[i] * strides2[i];
+        }
+
+        ssize_t relative_offset3 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset3 += mi[i] * strides3[i];
+        }
+
+        return ThreeOffsets<ssize_t>(starting_offset1 + relative_offset1,
+                                     starting_offset2 + relative_offset2,
+                                     starting_offset3 + relative_offset3);
+    }
+
+private:
+    dpctl::tensor::strides::CIndexer_array<nd, ssize_t> _ind;
+
+    std::array<ssize_t, nd> strides1;
+    std::array<ssize_t, nd> strides2;
+    std::array<ssize_t, nd> strides3;
+    ssize_t starting_offset1;
+    ssize_t starting_offset2;
+    ssize_t starting_offset3;
+};
+
+static_assert(
+    sycl::is_device_copyable_v<ThreeOffsets_FixedDimStridedIndexer<1>>);
+} // namespace dpctl::tensor::offset_utils
diff --git a/dpctl/tensor/libtensor/include/utils/output_validation.hpp b/dpctl/tensor/libtensor/include/utils/output_validation.hpp
new file mode 100644
index 000000000000..26f1b29bd3d8
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/utils/output_validation.hpp
@@ -0,0 +1,79 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines utilities for determining if an array is a valid output
+/// array.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <stdexcept>
+
+#include <pybind11/pybind11.h>
+
+#include "dpnp4pybind11.hpp"
+
+namespace dpctl::tensor::validation
+{
+namespace py = pybind11;
+
+/*! @brief Raises a value error if an array is read-only.
+
+    This should be called with an array before writing.*/
+struct CheckWritable
+{
+    static void throw_if_not_writable(const dpctl::tensor::usm_ndarray &arr)
+    {
+        if (!arr.is_writable()) {
+            throw py::value_error("output array is read-only.");
+        }
+        return;
+    }
+};
+
+/*! @brief Raises a value error if an array's memory is not sufficiently ample
+    to accommodate an input number of elements.
+
+    This should be called with an array before writing.*/
+struct AmpleMemory
+{
+    template <typename T>
+    static void throw_if_not_ample(const dpctl::tensor::usm_ndarray &arr,
+                                   T nelems)
+    {
+        auto arr_offsets = arr.get_minmax_offsets();
+        T range = static_cast<T>(arr_offsets.second - arr_offsets.first);
+        if (range + 1 < nelems) {
+            throw py::value_error("Memory addressed by the output array is not "
+                                  "sufficiently ample.");
+        }
+        return;
+    }
+};
+} // namespace dpctl::tensor::validation
diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
new file mode 100644
index 000000000000..0bed181802ae
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
@@ -0,0 +1,996 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines CIndexer_array, and CIndexer_vector classes, as well
+/// iteration space simplifiers.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <numeric>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+namespace dpctl::tensor::strides
+{
+/* An N-dimensional array can be stored in a single
+ * contiguous chunk of memory by contiguously laying
+ * array elements in lexicographinc order of their
+ * array indices. Such a layout is called C-contiguous.
+ *
+ * E.g. for (2, 3, 2) array `a` with zero-based indexing convention
+ * the C-array's elements are
+ * { a[0,0,0], a[0,0,1], a[0,1,0], a[0,1,1], a[0,2,0], a[0,2,1],
+ *   a[1,0,0], a[1,0,1], a[1,1,0], a[1,1,1], a[1,2,0], a[1,2,1] }
+ *
+ * Indexer maps zero-based index in C-array to a multi-index
+ * for the purpose of computing element displacement in the
+ * strided array, i.e. in the above example for k = 5, the displacement
+ * is (s0*0 + s1*2 + s2*1), and for k = 7 it is (s0*1 + s1*0 + s2*1)
+ * for N-dimensional array with strides (s0, s1, s2).
+ *
+ * Cindexer_vector need not know array rank `dim` at compile time.
+ * Shape and strides are stored in std::vector, which are not trivially
+ * copyable.
+ *
+ * For the class to be trivially copyable for offloading displacement
+ * computation methods take accessor/pointer arguments of type T for
+ * shape and stride and modify displacement argument passed by reference.
+ */
+template <typename indT = std::ptrdiff_t>
+class CIndexer_vector
+{
+    static_assert(std::is_integral<indT>::value, "Integral type is required");
+    static_assert(std::is_signed<indT>::value,
+                  "Signed integral type is required");
+    int nd;
+
+public:
+    CIndexer_vector(int dim) : nd(dim) {}
+
+    template <class ShapeTy>
+    indT size(const ShapeTy &shape) const
+    {
+        indT s = static_cast<indT>(1);
+        for (int i = 0; i < nd; ++i) {
+            s *= shape[i];
+        }
+        return s;
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const StridesTy &stride,
+                          indT &disp) const
+    {
+        if (nd == 1) {
+            disp = i * stride[0];
+            return;
+        }
+
+        indT i_ = i;
+        indT d = 0;
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            d += r * stride[dim];
+            i_ = q;
+        }
+        disp = d + i_ * stride[0];
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const StridesTy &stride1,
+                          const StridesTy &stride2,
+                          indT &disp1,
+                          indT &disp2) const
+    {
+        if (nd == 1) {
+            disp1 = i * stride1[0];
+            disp2 = i * stride2[0];
+            return;
+        }
+
+        indT i_ = i;
+        indT d1 = 0, d2 = 0;
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            i_ = q;
+            d1 += r * stride1[dim];
+            d2 += r * stride2[dim];
+        }
+        disp1 = d1 + i_ * stride1[0];
+        disp2 = d2 + i_ * stride2[0];
+        return;
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const StridesTy &stride1,
+                          const StridesTy &stride2,
+                          const StridesTy &stride3,
+                          indT &disp1,
+                          indT &disp2,
+                          indT &disp3) const
+    {
+        if (nd == 1) {
+            disp1 = i * stride1[0];
+            disp2 = i * stride2[0];
+            disp3 = i * stride3[0];
+            return;
+        }
+
+        indT i_ = i;
+        indT d1 = 0, d2 = 0, d3 = 0;
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            i_ = q;
+            d1 += r * stride1[dim];
+            d2 += r * stride2[dim];
+            d3 += r * stride3[dim];
+        };
+        disp1 = d1 + i_ * stride1[0];
+        disp2 = d2 + i_ * stride2[0];
+        disp3 = d3 + i_ * stride3[0];
+        return;
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const StridesTy &stride1,
+                          const StridesTy &stride2,
+                          const StridesTy &stride3,
+                          const StridesTy &stride4,
+                          indT &disp1,
+                          indT &disp2,
+                          indT &disp3,
+                          indT &disp4) const
+    {
+        if (nd == 1) {
+            disp1 = i * stride1[0];
+            disp2 = i * stride2[0];
+            disp3 = i * stride3[0];
+            disp4 = i * stride4[0];
+            return;
+        }
+
+        indT i_ = i;
+        indT d1 = 0, d2 = 0, d3 = 0, d4 = 0;
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            i_ = q;
+            d1 += r * stride1[dim];
+            d2 += r * stride2[dim];
+            d3 += r * stride3[dim];
+            d4 += r * stride4[dim];
+        }
+        disp1 = d1 + i_ * stride1[0];
+        disp2 = d2 + i_ * stride2[0];
+        disp3 = d3 + i_ * stride3[0];
+        disp4 = d4 + i_ * stride4[0];
+        return;
+    }
+
+    template <class ShapeTy, class StridesTy, int nstrides>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const std::array<StridesTy, nstrides> &strides,
+                          std::array<indT, nstrides> &disps) const
+    {
+        if (nd == 1) {
+            for (int k = 0; k < nstrides; ++k) {
+                disps[k] = i * strides[k][0];
+            }
+            return;
+        }
+
+        indT i_ = i;
+        std::array<indT, nstrides> ds;
+        for (int k = 0; k < nstrides; ++k) {
+            ds[k] = 0;
+        }
+
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            for (int k = 0; k < nstrides; ++k) {
+                ds[k] += r * strides[k][dim];
+            }
+            i_ = q;
+        };
+        for (int k = 0; k < nstrides; ++k) {
+            disps[k] = ds[k] + i_ * strides[k][0];
+        }
+        return;
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_left_rolled_displacement(const indT i,
+                                      const ShapeTy &shape,
+                                      const StridesTy &stride,
+                                      const StridesTy &shifts,
+                                      indT &disp) const
+    {
+        indT i_ = i;
+        indT d(0);
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            // assumes si > shifts[dim] >= 0
+            const indT shifted_r =
+                (r < shifts[dim] ? r + si - shifts[dim] : r - shifts[dim]);
+            d += shifted_r * stride[dim];
+            i_ = q;
+        }
+        const indT shifted_r =
+            (i_ < shifts[0] ? i_ + shape[0] - shifts[0] : i_ - shifts[0]);
+        disp = d + shifted_r * stride[0];
+    }
+};
+
+/*
+ * CIndexer is for arrays whose array-rank is known at compile time.
+ * Statically allocated shape and multi_index arrays are members of
+ * the class instance, and it remains trivially copyable.
+ *
+ * Method `set(k)` populates work-item private array multi_index, which
+ * can be accessed using `get()` to compute the displacement as needed.
+ */
+
+template <int _ndim, typename indT = std::ptrdiff_t>
+class CIndexer_array
+{
+    static constexpr int ndim = _ndim;
+
+    static_assert(std::is_integral<indT>::value, "Integral type is required");
+    static_assert(std::is_signed<indT>::value,
+                  "Signed integral type is required");
+    static_assert(ndim > 0, "Dimensionality must be positive");
+
+private:
+    typedef std::array<indT, ndim> index_t;
+
+    indT elem_count;
+    index_t shape;
+    index_t multi_index;
+
+public:
+    CIndexer_array() : elem_count(0), shape{}, multi_index{} {}
+
+    explicit CIndexer_array(const index_t &input_shape)
+        : elem_count(0), shape{}, multi_index{}
+    {
+        indT s(1);
+        for (int i = 0; i < ndim; ++i) {
+            shape[i] = input_shape[i];
+            s *= input_shape[i];
+        }
+        elem_count = s;
+    }
+
+    indT size() const
+    {
+        return elem_count;
+    }
+    indT rank() const
+    {
+        return ndim;
+    }
+
+    void set(const indT i)
+    {
+        if (ndim == 1) {
+            multi_index[0] = i;
+            return;
+        }
+
+        indT i_ = i;
+#pragma unroll
+        for (int dim = ndim; --dim > 0;) {
+            indT si = shape[dim];
+            indT q = i_ / si;
+            multi_index[dim] = i_ - q * si;
+            i_ = q;
+        }
+        multi_index[0] = i_;
+    }
+
+    const index_t &get() const
+    {
+        return multi_index;
+    }
+};
+
+/*
+    For purposes of iterating over elements of array with
+    `shape` and `strides` given as pointers
+    `simplify_iteration_strides(nd, shape_ptr, strides_ptr, disp)`
+    may modify memory and returns new length of these arrays.
+
+    The new shape and new strides, as well as the offset
+    `(new_shape, new_strides, disp)` are such that iterating over
+    them will traverse the same elements, possibly in
+    different order.
+
+    ..Example: python
+        import itertools
+        # for some array Y over whose elements we iterate
+        csh, cst, cp = contract_iter(Y.shape, Y.strides)
+        def pointers_set(sh, st, p):
+            citers = itertools.product(*map(lambda s: range(s), sh))
+            dot = lambda st, it: sum(st[k]*it[k] for k in range(len(st)))
+            return set(p + dot(st, it) for it in citers)
+        ps1 = pointers_set(csh, cst, cp)
+        ps2 = pointers_set(Y.shape, Y.strides, 0)
+        assert ps1 == ps2
+
+ */
+template <class ShapeTy, class StridesTy>
+int simplify_iteration_stride(const int nd,
+                              ShapeTy *shape,
+                              StridesTy *strides,
+                              StridesTy &disp)
+{
+    disp = StridesTy(0);
+    if (nd < 2)
+        return nd;
+
+    std::vector<int> pos(nd);
+    std::iota(pos.begin(), pos.end(), 0);
+
+    std::stable_sort(
+        pos.begin(), pos.end(), [&strides, &shape](int i1, int i2) {
+            auto abs_str1 = (strides[i1] < 0) ? -strides[i1] : strides[i1];
+            auto abs_str2 = (strides[i2] < 0) ? -strides[i2] : strides[i2];
+            return (abs_str1 > abs_str2) ||
+                   (abs_str1 == abs_str2 && shape[i1] > shape[i2]);
+        });
+
+    std::vector<ShapeTy> shape_w;
+    std::vector<StridesTy> strides_w;
+    int nd_ = nd;
+    shape_w.reserve(nd_);
+    strides_w.reserve(nd_);
+
+    for (int i = 0; i < nd; ++i) {
+        auto p = pos[i];
+        auto sh_p = shape[p];
+        auto str_p = strides[p];
+        shape_w.push_back(sh_p);
+        if (str_p < 0) {
+            disp += str_p * (sh_p - 1);
+            str_p = -str_p;
+        }
+        strides_w.push_back(str_p);
+    }
+
+    {
+        bool changed;
+        do {
+            changed = false;
+            for (int i = 0; i + 1 < nd_; ++i) {
+                StridesTy step = strides_w[i + 1];
+                StridesTy jump = strides_w[i] - (shape_w[i + 1] - 1) * step;
+                if (jump == step) {
+                    changed = true;
+                    for (int k = i; k + 1 < nd_; ++k) {
+                        strides_w[k] = strides_w[k + 1];
+                    }
+                    shape_w[i] *= shape_w[i + 1];
+                    for (int k = i + 1; k + 1 < nd_; ++k) {
+                        shape_w[k] = shape_w[k + 1];
+                    }
+                    --nd_;
+                }
+            }
+        } while (changed);
+    }
+
+    for (int i = 0; i < nd_; ++i) {
+        shape[i] = shape_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides[i] = strides_w[i];
+    }
+
+    return nd_;
+}
+
+/*
+    For purposes of iterating over pairs of elements of two arrays
+    with  `shape` and strides `strides1`, `strides2` given as pointers
+    `simplify_iteration_two_strides(nd, shape_ptr, strides1_ptr,
+    strides2_ptr, disp1, disp2)`
+    may modify memory and returns new length of these arrays.
+
+    The new shape and new strides, as well as the offset
+    `(new_shape, new_strides1, disp1, new_stride2, disp2)` are such that
+    iterating over them will traverse the same set of pairs of elements,
+    possibly in a different order.
+ */
+template <class ShapeTy, class StridesTy>
+int simplify_iteration_two_strides(const int nd,
+                                   ShapeTy *shape,
+                                   StridesTy *strides1,
+                                   StridesTy *strides2,
+                                   StridesTy &disp1,
+                                   StridesTy &disp2)
+{
+    disp1 = StridesTy(0);
+    disp2 = StridesTy(0);
+    if (nd < 2)
+        return nd;
+
+    std::vector<int> pos(nd);
+    std::iota(pos.begin(), pos.end(), 0);
+
+    std::stable_sort(
+        pos.begin(), pos.end(), [&strides1, &strides2, &shape](int i1, int i2) {
+            auto abs_str1_i1 =
+                (strides1[i1] < 0) ? -strides1[i1] : strides1[i1];
+            auto abs_str1_i2 =
+                (strides1[i2] < 0) ? -strides1[i2] : strides1[i2];
+            auto abs_str2_i1 =
+                (strides2[i1] < 0) ? -strides2[i1] : strides2[i1];
+            auto abs_str2_i2 =
+                (strides2[i2] < 0) ? -strides2[i2] : strides2[i2];
+            return (abs_str2_i1 > abs_str2_i2) ||
+                   (abs_str2_i1 == abs_str2_i2 &&
+                    (abs_str1_i1 > abs_str1_i2 ||
+                     (abs_str1_i1 == abs_str1_i2 && shape[i1] > shape[i2])));
+        });
+
+    std::vector<ShapeTy> shape_w;
+    std::vector<StridesTy> strides1_w;
+    std::vector<StridesTy> strides2_w;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        auto p = pos[i];
+        auto sh_p = shape[p];
+        auto str1_p = strides1[p];
+        auto str2_p = strides2[p];
+        shape_w.push_back(sh_p);
+        if (str1_p <= 0 && str2_p <= 0 && std::min(str1_p, str2_p) < 0) {
+            disp1 += str1_p * (sh_p - 1);
+            str1_p = -str1_p;
+            disp2 += str2_p * (sh_p - 1);
+            str2_p = -str2_p;
+        }
+        if (str1_p < 0 || str2_p < 0) {
+            contractable = false;
+        }
+        strides1_w.push_back(str1_p);
+        strides2_w.push_back(str2_p);
+    }
+
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str1 = strides1_w[i + 1];
+            StridesTy str2 = strides2_w[i + 1];
+            StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1;
+            StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2;
+
+            if (jump1 == str1 && jump2 == str2) {
+                changed = true;
+                shape_w[i] *= shape_w[i + 1];
+                for (int j = i; j < nd_; ++j) {
+                    strides1_w[j] = strides1_w[j + 1];
+                }
+                for (int j = i; j < nd_; ++j) {
+                    strides2_w[j] = strides2_w[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape_w[j] = shape_w[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+    for (int i = 0; i < nd_; ++i) {
+        shape[i] = shape_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides1[i] = strides1_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides2[i] = strides2_w[i];
+    }
+
+    return nd_;
+}
+
+template <typename T, class Error, typename vecT = std::vector<T>>
+std::tuple<vecT, vecT, T> contract_iter(const vecT &shape, const vecT &strides)
+{
+    const std::size_t dim = shape.size();
+    if (dim != strides.size()) {
+        throw Error("Shape and strides must be of equal size.");
+    }
+    vecT out_shape = shape;
+    vecT out_strides = strides;
+    T disp(0);
+
+    int nd = simplify_iteration_stride(dim, out_shape.data(),
+                                       out_strides.data(), disp);
+    out_shape.resize(nd);
+    out_strides.resize(nd);
+    return std::make_tuple(out_shape, out_strides, disp);
+}
+
+template <typename T, class Error, typename vecT = std::vector<T>>
+std::tuple<vecT, vecT, T, vecT, T> contract_iter2(const vecT &shape,
+                                                  const vecT &strides1,
+                                                  const vecT &strides2)
+{
+    const std::size_t dim = shape.size();
+    if (dim != strides1.size() || dim != strides2.size()) {
+        throw Error("Shape and strides must be of equal size.");
+    }
+    vecT out_shape = shape;
+    vecT out_strides1 = strides1;
+    vecT out_strides2 = strides2;
+    T disp1(0);
+    T disp2(0);
+
+    int nd = simplify_iteration_two_strides(dim, out_shape.data(),
+                                            out_strides1.data(),
+                                            out_strides2.data(), disp1, disp2);
+    out_shape.resize(nd);
+    out_strides1.resize(nd);
+    out_strides2.resize(nd);
+    return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2);
+}
+
+/*
+    For purposes of iterating over pairs of elements of three arrays
+    with  `shape` and strides `strides1`, `strides2`, `strides3` given as
+    pointers `simplify_iteration_three_strides(nd, shape_ptr, strides1_ptr,
+    strides2_ptr, strides3_ptr, disp1, disp2, disp3)`
+    may modify memory and returns new length of these arrays.
+
+    The new shape and new strides, as well as the offset
+    `(new_shape, new_strides1, disp1, new_stride2, disp2, new_stride3, disp3)`
+    are such that iterating over them will traverse the same set of tuples of
+    elements, possibly in a different order.
+ */
+template <class ShapeTy, class StridesTy>
+int simplify_iteration_three_strides(const int nd,
+                                     ShapeTy *shape,
+                                     StridesTy *strides1,
+                                     StridesTy *strides2,
+                                     StridesTy *strides3,
+                                     StridesTy &disp1,
+                                     StridesTy &disp2,
+                                     StridesTy &disp3)
+{
+    disp1 = StridesTy(0);
+    disp2 = StridesTy(0);
+    if (nd < 2)
+        return nd;
+
+    std::vector<int> pos(nd);
+    std::iota(pos.begin(), pos.end(), 0);
+
+    std::stable_sort(pos.begin(), pos.end(),
+                     [&strides1, &strides2, &strides3, &shape](int i1, int i2) {
+                         auto abs_str1_i1 =
+                             (strides1[i1] < 0) ? -strides1[i1] : strides1[i1];
+                         auto abs_str1_i2 =
+                             (strides1[i2] < 0) ? -strides1[i2] : strides1[i2];
+                         auto abs_str2_i1 =
+                             (strides2[i1] < 0) ? -strides2[i1] : strides2[i1];
+                         auto abs_str2_i2 =
+                             (strides2[i2] < 0) ? -strides2[i2] : strides2[i2];
+                         auto abs_str3_i1 =
+                             (strides3[i1] < 0) ? -strides3[i1] : strides3[i1];
+                         auto abs_str3_i2 =
+                             (strides3[i2] < 0) ? -strides3[i2] : strides3[i2];
+                         return (abs_str3_i1 > abs_str3_i2) ||
+                                ((abs_str3_i1 == abs_str3_i2) &&
+                                 ((abs_str2_i1 > abs_str2_i2) ||
+                                  ((abs_str2_i1 == abs_str2_i2) &&
+                                   ((abs_str1_i1 > abs_str1_i2) ||
+                                    ((abs_str1_i1 == abs_str1_i2) &&
+                                     (shape[i1] > shape[i2]))))));
+                     });
+
+    std::vector<ShapeTy> shape_w;
+    std::vector<StridesTy> strides1_w;
+    std::vector<StridesTy> strides2_w;
+    std::vector<StridesTy> strides3_w;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        auto p = pos[i];
+        auto sh_p = shape[p];
+        auto str1_p = strides1[p];
+        auto str2_p = strides2[p];
+        auto str3_p = strides3[p];
+        shape_w.push_back(sh_p);
+        if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 &&
+            std::min({str1_p, str2_p, str3_p}) < 0)
+        {
+            disp1 += str1_p * (sh_p - 1);
+            str1_p = -str1_p;
+            disp2 += str2_p * (sh_p - 1);
+            str2_p = -str2_p;
+            disp3 += str3_p * (sh_p - 1);
+            str3_p = -str3_p;
+        }
+        if (str1_p < 0 || str2_p < 0 || str3_p < 0) {
+            contractable = false;
+        }
+        strides1_w.push_back(str1_p);
+        strides2_w.push_back(str2_p);
+        strides3_w.push_back(str3_p);
+    }
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str1 = strides1_w[i + 1];
+            StridesTy str2 = strides2_w[i + 1];
+            StridesTy str3 = strides3_w[i + 1];
+            StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1;
+            StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2;
+            StridesTy jump3 = strides3_w[i] - (shape_w[i + 1] - 1) * str3;
+
+            if (jump1 == str1 && jump2 == str2 && jump3 == str3) {
+                changed = true;
+                shape_w[i] *= shape_w[i + 1];
+                for (int j = i; j < nd_; ++j) {
+                    strides1_w[j] = strides1_w[j + 1];
+                }
+                for (int j = i; j < nd_; ++j) {
+                    strides2_w[j] = strides2_w[j + 1];
+                }
+                for (int j = i; j < nd_; ++j) {
+                    strides3_w[j] = strides3_w[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape_w[j] = shape_w[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+    for (int i = 0; i < nd_; ++i) {
+        shape[i] = shape_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides1[i] = strides1_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides2[i] = strides2_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides3[i] = strides3_w[i];
+    }
+
+    return nd_;
+}
+
+template <typename T, class Error, typename vecT = std::vector<T>>
+std::tuple<vecT, vecT, T, vecT, T, vecT, T> contract_iter3(const vecT &shape,
+                                                           const vecT &strides1,
+                                                           const vecT &strides2,
+                                                           const vecT &strides3)
+{
+    const std::size_t dim = shape.size();
+    if (dim != strides1.size() || dim != strides2.size() ||
+        dim != strides3.size()) {
+        throw Error("Shape and strides must be of equal size.");
+    }
+    vecT out_shape = shape;
+    vecT out_strides1 = strides1;
+    vecT out_strides2 = strides2;
+    vecT out_strides3 = strides3;
+    T disp1(0);
+    T disp2(0);
+    T disp3(0);
+
+    int nd = simplify_iteration_three_strides(
+        dim, out_shape.data(), out_strides1.data(), out_strides2.data(),
+        out_strides3.data(), disp1, disp2, disp3);
+    out_shape.resize(nd);
+    out_strides1.resize(nd);
+    out_strides2.resize(nd);
+    out_strides3.resize(nd);
+    return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2,
+                           out_strides3, disp3);
+}
+
+/*
+    For purposes of iterating over pairs of elements of four arrays
+    with  `shape` and strides `strides1`, `strides2`, `strides3`,
+    `strides4` given as pointers `simplify_iteration_four_strides(nd,
+    shape_ptr, strides1_ptr, strides2_ptr, strides3_ptr, strides4_ptr,
+    disp1, disp2, disp3, disp4)` may modify memory and returns new
+    length of these arrays.
+
+    The new shape and new strides, as well as the offset
+    `(new_shape, new_strides1, disp1, new_stride2, disp2, new_stride3, disp3,
+    new_stride4, disp4)` are such that iterating over them will traverse the
+    same set of tuples of elements, possibly in a different order.
+ */
+template <class ShapeTy, class StridesTy>
+int simplify_iteration_four_strides(const int nd,
+                                    ShapeTy *shape,
+                                    StridesTy *strides1,
+                                    StridesTy *strides2,
+                                    StridesTy *strides3,
+                                    StridesTy *strides4,
+                                    StridesTy &disp1,
+                                    StridesTy &disp2,
+                                    StridesTy &disp3,
+                                    StridesTy &disp4)
+{
+    disp1 = StridesTy(0);
+    disp2 = StridesTy(0);
+    if (nd < 2)
+        return nd;
+
+    std::vector<int> pos(nd);
+    std::iota(pos.begin(), pos.end(), 0);
+
+    std::stable_sort(
+        pos.begin(), pos.end(),
+        [&strides1, &strides2, &strides3, &strides4, &shape](int i1, int i2) {
+            auto abs_str1_i1 =
+                (strides1[i1] < 0) ? -strides1[i1] : strides1[i1];
+            auto abs_str1_i2 =
+                (strides1[i2] < 0) ? -strides1[i2] : strides1[i2];
+            auto abs_str2_i1 =
+                (strides2[i1] < 0) ? -strides2[i1] : strides2[i1];
+            auto abs_str2_i2 =
+                (strides2[i2] < 0) ? -strides2[i2] : strides2[i2];
+            auto abs_str3_i1 =
+                (strides3[i1] < 0) ? -strides3[i1] : strides3[i1];
+            auto abs_str3_i2 =
+                (strides3[i2] < 0) ? -strides3[i2] : strides3[i2];
+            auto abs_str4_i1 =
+                (strides4[i1] < 0) ? -strides4[i1] : strides4[i1];
+            auto abs_str4_i2 =
+                (strides4[i2] < 0) ? -strides4[i2] : strides4[i2];
+            return (abs_str4_i1 > abs_str4_i2) ||
+                   ((abs_str4_i1 == abs_str4_i2) &&
+                    ((abs_str3_i1 > abs_str3_i2) ||
+                     ((abs_str3_i1 == abs_str3_i2) &&
+                      ((abs_str2_i1 > abs_str2_i2) ||
+                       ((abs_str2_i1 == abs_str2_i2) &&
+                        ((abs_str1_i1 > abs_str1_i2) ||
+                         ((abs_str1_i1 == abs_str1_i2) &&
+                          (shape[i1] > shape[i2]))))))));
+        });
+
+    std::vector<ShapeTy> shape_w;
+    std::vector<StridesTy> strides1_w;
+    std::vector<StridesTy> strides2_w;
+    std::vector<StridesTy> strides3_w;
+    std::vector<StridesTy> strides4_w;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        auto p = pos[i];
+        auto sh_p = shape[p];
+        auto str1_p = strides1[p];
+        auto str2_p = strides2[p];
+        auto str3_p = strides3[p];
+        auto str4_p = strides4[p];
+        shape_w.push_back(sh_p);
+        if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 && str4_p <= 0 &&
+            std::min({str1_p, str2_p, str3_p, str4_p}) < 0)
+        {
+            disp1 += str1_p * (sh_p - 1);
+            str1_p = -str1_p;
+            disp2 += str2_p * (sh_p - 1);
+            str2_p = -str2_p;
+            disp3 += str3_p * (sh_p - 1);
+            str3_p = -str3_p;
+            disp4 += str4_p * (sh_p - 1);
+            str4_p = -str4_p;
+        }
+        if (str1_p < 0 || str2_p < 0 || str3_p < 0 || str4_p < 0) {
+            contractable = false;
+        }
+        strides1_w.push_back(str1_p);
+        strides2_w.push_back(str2_p);
+        strides3_w.push_back(str3_p);
+        strides4_w.push_back(str4_p);
+    }
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str1 = strides1_w[i + 1];
+            StridesTy str2 = strides2_w[i + 1];
+            StridesTy str3 = strides3_w[i + 1];
+            StridesTy str4 = strides4_w[i + 1];
+            StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1;
+            StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2;
+            StridesTy jump3 = strides3_w[i] - (shape_w[i + 1] - 1) * str3;
+            StridesTy jump4 = strides4_w[i] - (shape_w[i + 1] - 1) * str4;
+
+            if (jump1 == str1 && jump2 == str2 && jump3 == str3 &&
+                jump4 == str4) {
+                changed = true;
+                shape_w[i] *= shape_w[i + 1];
+                for (int j = i; j < nd_; ++j) {
+                    strides1_w[j] = strides1_w[j + 1];
+                }
+                for (int j = i; j < nd_; ++j) {
+                    strides2_w[j] = strides2_w[j + 1];
+                }
+                for (int j = i; j < nd_; ++j) {
+                    strides3_w[j] = strides3_w[j + 1];
+                }
+                for (int j = i; j < nd_; ++j) {
+                    strides4_w[j] = strides4_w[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape_w[j] = shape_w[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+    for (int i = 0; i < nd_; ++i) {
+        shape[i] = shape_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides1[i] = strides1_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides2[i] = strides2_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides3[i] = strides3_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides4[i] = strides4_w[i];
+    }
+
+    return nd_;
+}
+
+template <typename T, class Error, typename vecT = std::vector<T>>
+std::tuple<vecT, vecT, T, vecT, T, vecT, T, vecT, T>
+    contract_iter4(const vecT &shape,
+                   const vecT &strides1,
+                   const vecT &strides2,
+                   const vecT &strides3,
+                   const vecT &strides4)
+{
+    const std::size_t dim = shape.size();
+    if (dim != strides1.size() || dim != strides2.size() ||
+        dim != strides3.size() || dim != strides4.size())
+    {
+        throw Error("Shape and strides must be of equal size.");
+    }
+    vecT out_shape = shape;
+    vecT out_strides1 = strides1;
+    vecT out_strides2 = strides2;
+    vecT out_strides3 = strides3;
+    vecT out_strides4 = strides4;
+    T disp1(0);
+    T disp2(0);
+    T disp3(0);
+    T disp4(0);
+
+    int nd = simplify_iteration_four_strides(
+        dim, out_shape.data(), out_strides1.data(), out_strides2.data(),
+        out_strides3.data(), out_strides4.data(), disp1, disp2, disp3, disp4);
+    out_shape.resize(nd);
+    out_strides1.resize(nd);
+    out_strides2.resize(nd);
+    out_strides3.resize(nd);
+    out_strides4.resize(nd);
+    return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2,
+                           out_strides3, disp3, out_strides4, disp4);
+}
+
+/*
+    For purposes of iterating over elements of an array with  `shape` and
+    strides `strides` given as pointers `compact_iteration(nd, shape, strides)`
+    may modify memory and returns the new length of the array.
+
+    The new shape and new strides `(new_shape, new_strides)` are such that
+    iterating over them will traverse the same elements in the same order,
+    possibly with reduced dimensionality.
+ */
+template <class ShapeTy, class StridesTy>
+int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides)
+{
+    if (nd < 2)
+        return nd;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        if (strides[i] < 0) {
+            contractable = false;
+        }
+    }
+
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str = strides[i + 1];
+            StridesTy jump = strides[i] - (shape[i + 1] - 1) * str;
+
+            if (jump == str) {
+                changed = true;
+                shape[i] *= shape[i + 1];
+                for (int j = i; j < nd_; ++j) {
+                    strides[j] = strides[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape[j] = shape[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+
+    return nd_;
+}
+} // namespace dpctl::tensor::strides
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
new file mode 100644
index 000000000000..76f0174b9fdf
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
@@ -0,0 +1,223 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines CIndexer_array, and CIndexer_vector classes, as well
+/// iteration space simplifiers.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <exception>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::alloc_utils
+{
+template <typename T>
+class usm_host_allocator : public sycl::usm_allocator<T, sycl::usm::alloc::host>
+{
+public:
+    using baseT = sycl::usm_allocator<T, sycl::usm::alloc::host>;
+    using baseT::baseT;
+
+    template <typename U>
+    struct rebind
+    {
+        typedef usm_host_allocator<U> other;
+    };
+
+    void deallocate(T *ptr, std::size_t n)
+    {
+        try {
+            baseT::deallocate(ptr, n);
+        } catch (const std::exception &e) {
+            std::cerr
+                << "Exception caught in `usm_host_allocator::deallocate`: "
+                << e.what() << std::endl;
+        }
+    }
+};
+
+template <typename T>
+void sycl_free_noexcept(T *ptr, const sycl::context &ctx) noexcept
+{
+    try {
+        sycl::free(ptr, ctx);
+    } catch (const std::exception &e) {
+        std::cerr << "Call to sycl::free caught exception: " << e.what()
+                  << std::endl;
+    }
+}
+
+template <typename T>
+void sycl_free_noexcept(T *ptr, const sycl::queue &q) noexcept
+{
+    sycl_free_noexcept(ptr, q.get_context());
+}
+
+class USMDeleter
+{
+private:
+    sycl::context ctx_;
+
+public:
+    USMDeleter(const sycl::queue &q) : ctx_(q.get_context()) {}
+    USMDeleter(const sycl::context &ctx) : ctx_(ctx) {}
+
+    template <typename T>
+    void operator()(T *ptr) const
+    {
+        sycl_free_noexcept(ptr, ctx_);
+    }
+};
+
+template <typename T>
+std::unique_ptr<T, USMDeleter>
+    smart_malloc(std::size_t count,
+                 const sycl::queue &q,
+                 sycl::usm::alloc kind,
+                 const sycl::property_list &propList = {})
+{
+    T *ptr = sycl::malloc<T>(count, q, kind, propList);
+    if (nullptr == ptr) {
+        throw std::runtime_error("Unable to allocate device_memory");
+    }
+
+    auto usm_deleter = USMDeleter(q);
+    return std::unique_ptr<T, USMDeleter>(ptr, usm_deleter);
+}
+
+template <typename T>
+std::unique_ptr<T, USMDeleter>
+    smart_malloc_device(std::size_t count,
+                        const sycl::queue &q,
+                        const sycl::property_list &propList = {})
+{
+    return smart_malloc<T>(count, q, sycl::usm::alloc::device, propList);
+}
+
+template <typename T>
+std::unique_ptr<T, USMDeleter>
+    smart_malloc_shared(std::size_t count,
+                        const sycl::queue &q,
+                        const sycl::property_list &propList = {})
+{
+    return smart_malloc<T>(count, q, sycl::usm::alloc::shared, propList);
+}
+
+template <typename T>
+std::unique_ptr<T, USMDeleter>
+    smart_malloc_host(std::size_t count,
+                      const sycl::queue &q,
+                      const sycl::property_list &propList = {})
+{
+    return smart_malloc<T>(count, q, sycl::usm::alloc::host, propList);
+}
+
+namespace detail
+{
+template <typename T>
+struct valid_smart_ptr : public std::false_type
+{
+};
+
+template <typename ValT, typename DeleterT>
+struct valid_smart_ptr<std::unique_ptr<ValT, DeleterT> &>
+    : public std::is_same<DeleterT, USMDeleter>
+{
+};
+
+template <typename ValT, typename DeleterT>
+struct valid_smart_ptr<std::unique_ptr<ValT, DeleterT>>
+    : public std::is_same<DeleterT, USMDeleter>
+{
+};
+
+// base case
+template <typename... Rest>
+struct all_valid_smart_ptrs
+{
+    static constexpr bool value = true;
+};
+
+template <typename Arg, typename... RestArgs>
+struct all_valid_smart_ptrs<Arg, RestArgs...>
+{
+    static constexpr bool value = valid_smart_ptr<Arg>::value &&
+                                  (all_valid_smart_ptrs<RestArgs...>::value);
+};
+} // end of namespace detail
+
+/*! @brief Submit host_task and transfer ownership from smart pointers to it */
+template <typename... UniquePtrTs>
+sycl::event async_smart_free(sycl::queue &exec_q,
+                             const std::vector<sycl::event> &depends,
+                             UniquePtrTs &&...unique_pointers)
+{
+    static constexpr std::size_t n = sizeof...(UniquePtrTs);
+    static_assert(
+        n > 0, "async_smart_free requires at least one smart pointer argument");
+
+    static_assert(
+        detail::all_valid_smart_ptrs<UniquePtrTs...>::value,
+        "async_smart_free requires unique_ptr created with smart_malloc");
+
+    std::vector<void *> ptrs;
+    ptrs.reserve(n);
+    (ptrs.push_back(reinterpret_cast<void *>(unique_pointers.get())), ...);
+
+    std::vector<USMDeleter> dels;
+    dels.reserve(n);
+    (dels.emplace_back(unique_pointers.get_deleter()), ...);
+
+    sycl::event ht_e = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        cgh.host_task([ptrs = std::move(ptrs), dels = std::move(dels)]() {
+            for (std::size_t i = 0; i < ptrs.size(); ++i) {
+                dels[i](ptrs[i]);
+            }
+        });
+    });
+
+    // Upon successful submission of host_task, USM allocations are owned
+    // by the host_task. Release smart pointer ownership to avoid double
+    // deallocation
+    (unique_pointers.release(), ...);
+
+    return ht_e;
+}
+} // namespace dpctl::tensor::alloc_utils
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
new file mode 100644
index 000000000000..1cb70adafeec
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -0,0 +1,662 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines utilities used for kernel submission.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "math_utils.hpp"
+
+namespace dpctl::tensor::sycl_utils
+{
+namespace detail
+{
+template <typename...>
+struct TypeList;
+
+template <typename Head, typename... Tail>
+struct TypeList<Head, Tail...>
+{
+    using head = Head;
+    using tail = TypeList<Tail...>;
+};
+
+using NullTypeList = TypeList<>;
+template <typename T>
+struct IsNullTypeList : std::conditional_t<std::is_same_v<T, NullTypeList>,
+                                           std::true_type,
+                                           std::false_type>
+{
+};
+
+// recursively check if type is contained in given TypeList
+template <typename T, typename TList>
+struct IsContained
+    : std::conditional_t<
+          std::is_same_v<typename TList::head, std::remove_cv_t<T>>,
+          std::true_type,
+          IsContained<T, typename TList::tail>>
+{
+};
+
+template <>
+struct TypeList<>
+{
+};
+
+// std::false_type when last case has been checked for membership
+template <typename T>
+struct IsContained<T, NullTypeList> : std::false_type
+{
+};
+
+template <class T>
+struct IsComplex : std::false_type
+{
+};
+template <class T>
+struct IsComplex<std::complex<T>> : std::true_type
+{
+};
+} // namespace detail
+
+template <typename T>
+using sycl_ops = detail::TypeList<sycl::plus<T>,
+                                  sycl::bit_or<T>,
+                                  sycl::bit_xor<T>,
+                                  sycl::bit_and<T>,
+                                  sycl::maximum<T>,
+                                  sycl::minimum<T>,
+                                  sycl::multiplies<T>>;
+
+template <typename T, typename Op>
+struct IsSyclOp
+{
+    static constexpr bool value =
+        detail::IsContained<Op, sycl_ops<std::remove_const_t<T>>>::value ||
+        detail::IsContained<Op, sycl_ops<std::add_const_t<T>>>::value;
+};
+
+/*! @brief Find the smallest multiple of supported sub-group size larger than
+ * nelems */
+template <std::size_t f = 4>
+std::size_t choose_workgroup_size(const std::size_t nelems,
+                                  const std::vector<std::size_t> &sg_sizes)
+{
+    std::vector<std::size_t> wg_choices;
+    wg_choices.reserve(f * sg_sizes.size());
+
+    for (const auto &sg_size : sg_sizes) {
+#pragma unroll
+        for (std::size_t i = 1; i <= f; ++i) {
+            wg_choices.push_back(sg_size * i);
+        }
+    }
+    std::sort(std::begin(wg_choices), std::end(wg_choices));
+
+    std::size_t wg = 1;
+    for (std::size_t i = 0; i < wg_choices.size(); ++i) {
+        if (wg_choices[i] == wg) {
+            continue;
+        }
+        wg = wg_choices[i];
+        std::size_t n_groups = ((nelems + wg - 1) / wg);
+        if (n_groups == 1)
+            break;
+    }
+
+    return wg;
+}
+
+namespace detail
+{
+
+template <typename LocAccT, typename OpT>
+void _fold(LocAccT &local_mem_acc,
+           const std::uint32_t lid,
+           const std::uint32_t cutoff,
+           const std::uint32_t step,
+           const OpT &op)
+{
+    if (lid < cutoff) {
+        local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]);
+    }
+}
+
+template <typename LocAccT, typename OpT>
+void _fold(LocAccT &local_mem_acc,
+           const std::uint32_t lid,
+           const std::uint32_t step,
+           const OpT &op)
+{
+    if (lid < step) {
+        local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]);
+    }
+}
+
+} // end of namespace detail
+
+template <typename T, typename GroupT, typename LocAccT, typename OpT>
+T custom_reduce_over_group(const GroupT &wg,
+                           LocAccT local_mem_acc,
+                           const T &local_val,
+                           const OpT &op)
+{
+    // value experimentally tuned to achieve best runtime on Iris Xe,
+    // Arc A140V integrated Intel GPUs, and discrete Intel Max GPU.
+    static constexpr std::uint32_t low_sz = 8u;
+    // maximal work-group size
+    static constexpr std::uint32_t high_sz = 1024u;
+    const std::uint32_t wgs = wg.get_local_linear_range();
+    const std::uint32_t lid = wg.get_local_linear_id();
+
+    local_mem_acc[lid] = local_val;
+    sycl::group_barrier(wg, sycl::memory_scope::work_group);
+
+    std::uint32_t n_witems = wgs;
+    if (wgs & (wgs - 1)) {
+        // wgs is not a power of 2
+#pragma unroll
+        for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) {
+            if (n_witems >= sz) {
+                const std::uint32_t n_witems_ = (n_witems + 1) >> 1;
+                detail::_fold(local_mem_acc, lid, n_witems - n_witems_,
+                              n_witems_, op);
+                sycl::group_barrier(wg, sycl::memory_scope::work_group);
+                n_witems = n_witems_;
+            }
+        }
+    }
+    else {
+        // wgs is a power of 2
+#pragma unroll
+        for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) {
+            if (n_witems >= sz) {
+                n_witems >>= 1;
+                detail::_fold(local_mem_acc, lid, n_witems, op);
+                sycl::group_barrier(wg, sycl::memory_scope::work_group);
+            }
+        }
+    }
+
+    T red_val_over_wg = local_mem_acc[0];
+    if (wg.leader()) {
+        for (std::uint32_t i = 1; i < n_witems; ++i) {
+            red_val_over_wg = op(red_val_over_wg, local_mem_acc[i]);
+        }
+    }
+
+    return sycl::group_broadcast(wg, red_val_over_wg, 0);
+}
+
+template <typename GroupT,
+          typename SubGroupT,
+          typename LocAccT,
+          typename T,
+          typename OpT>
+T custom_inclusive_scan_over_group(GroupT &&wg,
+                                   SubGroupT &&sg,
+                                   LocAccT &&local_mem_acc,
+                                   const T &local_val,
+                                   const T &identity,
+                                   OpT &&op)
+{
+    const std::uint32_t local_id = wg.get_local_id(0);
+    const std::uint32_t wgs = wg.get_local_range(0);
+
+    const std::uint32_t lane_id = sg.get_local_id()[0];
+    const std::uint32_t sgSize = sg.get_local_range()[0];
+
+    T scan_val = local_val;
+    for (std::uint32_t step = 1; step < sgSize; step *= 2) {
+        const bool advanced_lane = (lane_id >= step);
+        const std::uint32_t src_lane_id =
+            (advanced_lane ? lane_id - step : lane_id);
+        const T modifier = sycl::select_from_group(sg, scan_val, src_lane_id);
+        if (advanced_lane) {
+            scan_val = op(scan_val, modifier);
+        }
+    }
+
+    local_mem_acc[local_id] = scan_val;
+    sycl::group_barrier(wg, sycl::memory_scope::work_group);
+
+    const std::uint32_t max_sgSize = sg.get_max_local_range()[0];
+    const std::uint32_t sgr_id = sg.get_group_id()[0];
+
+    // now scan
+    const std::uint32_t n_aggregates = 1 + ((wgs - 1) / max_sgSize);
+    const bool large_wg = (n_aggregates > max_sgSize);
+    if (large_wg) {
+        if (wg.leader()) {
+            T _scan_val = identity;
+            for (std::uint32_t i = 1; i <= n_aggregates - max_sgSize; ++i) {
+                _scan_val = op(local_mem_acc[i * max_sgSize - 1], _scan_val);
+                local_mem_acc[i * max_sgSize - 1] = _scan_val;
+            }
+        }
+        sycl::group_barrier(wg, sycl::memory_scope::work_group);
+    }
+
+    if (sgr_id == 0) {
+        const std::uint32_t offset =
+            (large_wg) ? n_aggregates - max_sgSize : 0u;
+        const bool in_range = (lane_id < n_aggregates);
+        const bool in_bounds = in_range && (lane_id > 0 || large_wg);
+
+        T __scan_val = (in_bounds)
+                           ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
+                           : identity;
+        for (std::uint32_t step = 1; step < sgSize; step *= 2) {
+            const bool advanced_lane = (lane_id >= step);
+            const std::uint32_t src_lane_id =
+                (advanced_lane ? lane_id - step : lane_id);
+            const T modifier =
+                sycl::select_from_group(sg, __scan_val, src_lane_id);
+            if (advanced_lane && in_range) {
+                __scan_val = op(__scan_val, modifier);
+            }
+        }
+        if (in_bounds) {
+            local_mem_acc[(offset + lane_id) * max_sgSize - 1] = __scan_val;
+        }
+    }
+    sycl::group_barrier(wg, sycl::memory_scope::work_group);
+
+    if (sgr_id > 0) {
+        const T modifier = local_mem_acc[sgr_id * max_sgSize - 1];
+        scan_val = op(scan_val, modifier);
+    }
+
+    // ensure all work-items finished reading from SLM
+    sycl::group_barrier(wg, sycl::memory_scope::work_group);
+
+    return scan_val;
+}
+
+// Reduction functors
+
+// Maximum
+
+template <typename T>
+struct Maximum
+{
+    T operator()(const T &x, const T &y) const
+    {
+        if constexpr (detail::IsComplex<T>::value) {
+            using dpctl::tensor::math_utils::max_complex;
+            return max_complex<T>(x, y);
+        }
+        else if constexpr (std::is_floating_point_v<T> ||
+                           std::is_same_v<T, sycl::half>) {
+            return (std::isnan(x) || x > y) ? x : y;
+        }
+        else if constexpr (std::is_same_v<T, bool>) {
+            return x || y;
+        }
+        else {
+            return (x > y) ? x : y;
+        }
+    }
+};
+
+// Minimum
+
+template <typename T>
+struct Minimum
+{
+    T operator()(const T &x, const T &y) const
+    {
+        if constexpr (detail::IsComplex<T>::value) {
+            using dpctl::tensor::math_utils::min_complex;
+            return min_complex<T>(x, y);
+        }
+        else if constexpr (std::is_floating_point_v<T> ||
+                           std::is_same_v<T, sycl::half>) {
+            return (std::isnan(x) || x < y) ? x : y;
+        }
+        else if constexpr (std::is_same_v<T, bool>) {
+            return x && y;
+        }
+        else {
+            return (x < y) ? x : y;
+        }
+    }
+};
+
+// Define identities and operator checking structs
+
+template <typename Op, typename T, typename = void>
+struct GetIdentity
+{
+};
+
+// Maximum
+
+template <typename T, class Op>
+using IsMaximum = std::bool_constant<std::is_same_v<Op, sycl::maximum<T>> ||
+                                     std::is_same_v<Op, Maximum<T>>>;
+
+template <typename T, class Op>
+using IsSyclMaximum = std::bool_constant<std::is_same_v<Op, sycl::maximum<T>>>;
+
+template <typename Op, typename T>
+struct GetIdentity<Op, T, std::enable_if_t<IsMaximum<T, Op>::value>>
+{
+    static constexpr T value =
+        static_cast<T>(std::numeric_limits<T>::has_infinity
+                           ? static_cast<T>(-std::numeric_limits<T>::infinity())
+                           : std::numeric_limits<T>::lowest());
+};
+
+template <typename Op>
+struct GetIdentity<Op, bool, std::enable_if_t<IsMaximum<bool, Op>::value>>
+{
+    static constexpr bool value = false;
+};
+
+template <typename Op, typename T>
+struct GetIdentity<Op,
+                   std::complex<T>,
+                   std::enable_if_t<IsMaximum<std::complex<T>, Op>::value>>
+{
+    static constexpr std::complex<T> value{-std::numeric_limits<T>::infinity(),
+                                           -std::numeric_limits<T>::infinity()};
+};
+
+// Minimum
+
+template <typename T, class Op>
+using IsMinimum = std::bool_constant<std::is_same_v<Op, sycl::minimum<T>> ||
+                                     std::is_same_v<Op, Minimum<T>>>;
+
+template <typename T, class Op>
+using IsSyclMinimum = std::bool_constant<std::is_same_v<Op, sycl::minimum<T>>>;
+
+template <typename Op, typename T>
+struct GetIdentity<Op, T, std::enable_if_t<IsMinimum<T, Op>::value>>
+{
+    static constexpr T value =
+        static_cast<T>(std::numeric_limits<T>::has_infinity
+                           ? static_cast<T>(std::numeric_limits<T>::infinity())
+                           : std::numeric_limits<T>::max());
+};
+
+template <typename Op>
+struct GetIdentity<Op, bool, std::enable_if_t<IsMinimum<bool, Op>::value>>
+{
+    static constexpr bool value = true;
+};
+
+template <typename Op, typename T>
+struct GetIdentity<Op,
+                   std::complex<T>,
+                   std::enable_if_t<IsMinimum<std::complex<T>, Op>::value>>
+{
+    static constexpr std::complex<T> value{std::numeric_limits<T>::infinity(),
+                                           std::numeric_limits<T>::infinity()};
+};
+
+// Plus
+
+template <typename T, class Op>
+using IsPlus = std::bool_constant<std::is_same_v<Op, sycl::plus<T>> ||
+                                  std::is_same_v<Op, std::plus<T>>>;
+
+template <typename T, class Op>
+using IsSyclPlus = std::bool_constant<std::is_same_v<Op, sycl::plus<T>>>;
+
+// Multiplies
+
+template <typename T, class Op>
+using IsMultiplies =
+    std::bool_constant<std::is_same_v<Op, sycl::multiplies<T>> ||
+                       std::is_same_v<Op, std::multiplies<T>>>;
+
+template <typename T, class Op>
+using IsSyclMultiplies =
+    std::bool_constant<std::is_same_v<Op, sycl::multiplies<T>>>;
+
+template <typename Op, typename T>
+struct GetIdentity<Op, T, std::enable_if_t<IsMultiplies<T, Op>::value>>
+{
+    static constexpr T value = static_cast<T>(1);
+};
+
+// LogSumExp
+
+template <typename T>
+struct LogSumExp
+{
+    T operator()(const T &x, const T &y) const
+    {
+        using dpctl::tensor::math_utils::logaddexp;
+        return logaddexp<T>(x, y);
+    }
+};
+
+template <typename T, class Op>
+using IsLogSumExp = std::bool_constant<std::is_same_v<Op, LogSumExp<T>>>;
+
+// only defined for types with infinity
+template <typename Op, typename T>
+struct GetIdentity<Op, T, std::enable_if_t<IsLogSumExp<T, Op>::value>>
+{
+    static constexpr T value = -std::numeric_limits<T>::infinity();
+};
+
+// Hypot
+
+template <typename T>
+struct Hypot
+{
+    T operator()(const T &x, const T &y) const
+    {
+        return sycl::hypot(x, y);
+    }
+};
+
+template <typename T, class Op>
+using IsHypot = std::bool_constant<std::is_same_v<Op, Hypot<T>>>;
+
+template <typename Op, typename T>
+struct GetIdentity<Op, T, std::enable_if_t<IsHypot<T, Op>::value>>
+{
+    static constexpr T value = 0;
+};
+
+// Logical_And
+
+template <typename T, class Op>
+using IsLogicalAnd =
+    std::bool_constant<std::is_same_v<Op, sycl::logical_and<T>> ||
+                       std::is_same_v<Op, std::logical_and<T>>>;
+
+template <typename T, class Op>
+using IsSyclLogicalAnd =
+    std::bool_constant<std::is_same_v<Op, sycl::logical_and<T>>>;
+
+template <typename Op, typename T>
+struct GetIdentity<Op, T, std::enable_if_t<IsLogicalAnd<T, Op>::value>>
+{
+    static constexpr T value = static_cast<T>(1);
+};
+
+// Logical_Or
+
+template <typename T, class Op>
+using IsLogicalOr =
+    std::bool_constant<std::is_same_v<Op, sycl::logical_or<T>> ||
+                       std::is_same_v<Op, std::logical_or<T>>>;
+
+template <typename T, class Op>
+using IsSyclLogicalOr =
+    std::bool_constant<std::is_same_v<Op, sycl::logical_or<T>>>;
+
+template <typename Op, typename T>
+struct GetIdentity<Op, T, std::enable_if_t<IsLogicalOr<T, Op>::value>>
+{
+    static constexpr T value = static_cast<T>(0);
+};
+
+// Identity
+
+template <typename Op, typename T, typename = void>
+struct Identity
+{
+};
+
+template <typename Op, typename T>
+using UseBuiltInIdentity =
+    std::conjunction<IsSyclOp<T, Op>, sycl::has_known_identity<Op, T>>;
+
+template <typename Op, typename T>
+struct Identity<Op, T, std::enable_if_t<!UseBuiltInIdentity<Op, T>::value>>
+{
+    static constexpr T value = GetIdentity<Op, T>::value;
+};
+
+template <typename Op, typename T>
+struct Identity<Op, T, std::enable_if_t<UseBuiltInIdentity<Op, T>::value>>
+{
+    static constexpr T value = sycl::known_identity<Op, T>::value;
+};
+
+// Sub-group load/store
+
+#ifndef USE_GROUP_LOAD_STORE
+#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE) &&                               \
+    SYCL_EXT_ONEAPI_GROUP_LOAD_STORE
+#define USE_GROUP_LOAD_STORE 1
+#else
+#if defined(__LIBSYCL_MAJOR_VERSION) && (__LIBSYCL_MAJOR_VERSION >= 8u)
+#define USE_GROUP_LOAD_STORE 1
+#else
+#define USE_GROUP_LOAD_STORE 0
+#endif
+#endif
+#endif
+
+#if (USE_GROUP_LOAD_STORE)
+namespace ls_ns = sycl::ext::oneapi::experimental;
+#endif
+
+template <std::uint8_t vec_sz,
+          sycl::access::address_space Space,
+          sycl::access::decorated DecorateAddress,
+          typename ElementType>
+auto sub_group_load(const sycl::sub_group &sg,
+                    sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
+{
+#if (USE_GROUP_LOAD_STORE)
+    using ValueT = typename std::remove_cv_t<ElementType>;
+    sycl::vec<ValueT, vec_sz> x{};
+    static constexpr auto striped =
+        ls_ns::properties{ls_ns::data_placement_striped};
+    ls_ns::group_load(sg, m_ptr, x, striped);
+    return x;
+#else
+    return sg.load<vec_sz>(m_ptr);
+#endif
+}
+
+template <sycl::access::address_space Space,
+          sycl::access::decorated DecorateAddress,
+          typename ElementType>
+auto sub_group_load(const sycl::sub_group &sg,
+                    sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
+{
+#if (USE_GROUP_LOAD_STORE)
+    using ValueT = typename std::remove_cv_t<ElementType>;
+    ValueT x{};
+    static constexpr auto striped =
+        ls_ns::properties{ls_ns::data_placement_striped};
+    ls_ns::group_load(sg, m_ptr, x, striped);
+    return x;
+#else
+    return sg.load(m_ptr);
+#endif
+}
+
+template <std::uint8_t vec_sz,
+          sycl::access::address_space Space,
+          sycl::access::decorated DecorateAddress,
+          typename VecT,
+          typename ElementType>
+std::enable_if_t<
+    std::is_same_v<std::remove_cv_t<ElementType>, std::remove_cv_t<VecT>>,
+    void>
+    sub_group_store(const sycl::sub_group &sg,
+                    const sycl::vec<VecT, vec_sz> &val,
+                    sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
+{
+#if (USE_GROUP_LOAD_STORE)
+    static_assert(std::is_same_v<VecT, ElementType>);
+    static constexpr auto striped =
+        ls_ns::properties{ls_ns::data_placement_striped};
+    ls_ns::group_store(sg, val, m_ptr, striped);
+    return;
+#else
+    sg.store<vec_sz>(m_ptr, val);
+    return;
+#endif
+}
+
+template <sycl::access::address_space Space,
+          sycl::access::decorated DecorateAddress,
+          typename VecT,
+          typename ElementType>
+std::enable_if_t<
+    std::is_same_v<std::remove_cv_t<ElementType>, std::remove_cv_t<VecT>>,
+    void>
+    sub_group_store(const sycl::sub_group &sg,
+                    const VecT &val,
+                    sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
+{
+#if (USE_GROUP_LOAD_STORE)
+    static constexpr auto striped =
+        ls_ns::properties{ls_ns::data_placement_striped};
+    ls_ns::group_store(sg, val, m_ptr, striped);
+    return;
+#else
+    sg.store(m_ptr, val);
+    return;
+#endif
+}
+} // namespace dpctl::tensor::sycl_utils
diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp
new file mode 100644
index 000000000000..242c2cf8724a
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp
@@ -0,0 +1,134 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines class to implement dispatch tables for pair of types
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+
+#include "dpnp4pybind11.hpp"
+
+#include "type_dispatch_building.hpp"
+
+namespace dpctl::tensor::type_dispatch
+{
+struct usm_ndarray_types
+{
+    int typenum_to_lookup_id(int typenum) const
+    {
+        using typenum_t = ::dpctl::tensor::type_dispatch::typenum_t;
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+
+        if (typenum == api.UAR_DOUBLE_) {
+            return static_cast<int>(typenum_t::DOUBLE);
+        }
+        else if (typenum == api.UAR_INT64_) {
+            return static_cast<int>(typenum_t::INT64);
+        }
+        else if (typenum == api.UAR_INT32_) {
+            return static_cast<int>(typenum_t::INT32);
+        }
+        else if (typenum == api.UAR_BOOL_) {
+            return static_cast<int>(typenum_t::BOOL);
+        }
+        else if (typenum == api.UAR_CDOUBLE_) {
+            return static_cast<int>(typenum_t::CDOUBLE);
+        }
+        else if (typenum == api.UAR_FLOAT_) {
+            return static_cast<int>(typenum_t::FLOAT);
+        }
+        else if (typenum == api.UAR_INT16_) {
+            return static_cast<int>(typenum_t::INT16);
+        }
+        else if (typenum == api.UAR_INT8_) {
+            return static_cast<int>(typenum_t::INT8);
+        }
+        else if (typenum == api.UAR_UINT64_) {
+            return static_cast<int>(typenum_t::UINT64);
+        }
+        else if (typenum == api.UAR_UINT32_) {
+            return static_cast<int>(typenum_t::UINT32);
+        }
+        else if (typenum == api.UAR_UINT16_) {
+            return static_cast<int>(typenum_t::UINT16);
+        }
+        else if (typenum == api.UAR_UINT8_) {
+            return static_cast<int>(typenum_t::UINT8);
+        }
+        else if (typenum == api.UAR_CFLOAT_) {
+            return static_cast<int>(typenum_t::CFLOAT);
+        }
+        else if (typenum == api.UAR_HALF_) {
+            return static_cast<int>(typenum_t::HALF);
+        }
+        else if (typenum == api.UAR_INT_ || typenum == api.UAR_UINT_) {
+            switch (sizeof(int)) {
+            case sizeof(std::int32_t):
+                return ((typenum == api.UAR_INT_)
+                            ? static_cast<int>(typenum_t::INT32)
+                            : static_cast<int>(typenum_t::UINT32));
+            case sizeof(std::int64_t):
+                return ((typenum == api.UAR_INT_)
+                            ? static_cast<int>(typenum_t::INT64)
+                            : static_cast<int>(typenum_t::UINT64));
+            default:
+                throw_unrecognized_typenum_error(typenum);
+            }
+        }
+        else if (typenum == api.UAR_LONGLONG_ || typenum == api.UAR_ULONGLONG_)
+        {
+            switch (sizeof(long long)) {
+            case sizeof(std::int64_t):
+                return ((typenum == api.UAR_LONGLONG_)
+                            ? static_cast<int>(typenum_t::INT64)
+                            : static_cast<int>(typenum_t::UINT64));
+            default:
+                throw_unrecognized_typenum_error(typenum);
+            }
+        }
+        else {
+            throw_unrecognized_typenum_error(typenum);
+        }
+        // return code signalling error, should never be reached
+        assert(false);
+        return -1;
+    }
+
+private:
+    void throw_unrecognized_typenum_error(int typenum) const
+    {
+        throw std::runtime_error("Unrecognized typenum " +
+                                 std::to_string(typenum) + " encountered.");
+    }
+};
+} // namespace dpctl::tensor::type_dispatch
diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp
new file mode 100644
index 000000000000..b1e02eb1513b
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp
@@ -0,0 +1,300 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines class to implement dispatch tables for pair of types
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::type_dispatch
+{
+enum class typenum_t : int
+{
+    BOOL = 0,
+    INT8, // 1
+    UINT8,
+    INT16,
+    UINT16,
+    INT32, // 5
+    UINT32,
+    INT64,
+    UINT64,
+    HALF,
+    FLOAT, // 10
+    DOUBLE,
+    CFLOAT,
+    CDOUBLE, // 13
+};
+inline constexpr int num_types = 14; // number of elements in typenum_t
+
+template <typename funcPtrT,
+          template <typename fnT, typename D, typename S>
+          typename factory,
+          int _num_types>
+class DispatchTableBuilder
+{
+private:
+    template <typename dstTy>
+    const std::vector<funcPtrT> row_per_dst_type() const
+    {
+        std::vector<funcPtrT> per_dstTy = {
+            factory<funcPtrT, dstTy, bool>{}.get(),
+            factory<funcPtrT, dstTy, std::int8_t>{}.get(),
+            factory<funcPtrT, dstTy, std::uint8_t>{}.get(),
+            factory<funcPtrT, dstTy, std::int16_t>{}.get(),
+            factory<funcPtrT, dstTy, std::uint16_t>{}.get(),
+            factory<funcPtrT, dstTy, std::int32_t>{}.get(),
+            factory<funcPtrT, dstTy, std::uint32_t>{}.get(),
+            factory<funcPtrT, dstTy, std::int64_t>{}.get(),
+            factory<funcPtrT, dstTy, std::uint64_t>{}.get(),
+            factory<funcPtrT, dstTy, sycl::half>{}.get(),
+            factory<funcPtrT, dstTy, float>{}.get(),
+            factory<funcPtrT, dstTy, double>{}.get(),
+            factory<funcPtrT, dstTy, std::complex<float>>{}.get(),
+            factory<funcPtrT, dstTy, std::complex<double>>{}.get()};
+        assert(per_dstTy.size() == _num_types);
+        return per_dstTy;
+    }
+
+public:
+    DispatchTableBuilder() = default;
+    ~DispatchTableBuilder() = default;
+
+    void populate_dispatch_table(funcPtrT table[][_num_types]) const
+    {
+        const auto map_by_dst_type = {row_per_dst_type<bool>(),
+                                      row_per_dst_type<std::int8_t>(),
+                                      row_per_dst_type<std::uint8_t>(),
+                                      row_per_dst_type<std::int16_t>(),
+                                      row_per_dst_type<std::uint16_t>(),
+                                      row_per_dst_type<std::int32_t>(),
+                                      row_per_dst_type<std::uint32_t>(),
+                                      row_per_dst_type<std::int64_t>(),
+                                      row_per_dst_type<std::uint64_t>(),
+                                      row_per_dst_type<sycl::half>(),
+                                      row_per_dst_type<float>(),
+                                      row_per_dst_type<double>(),
+                                      row_per_dst_type<std::complex<float>>(),
+                                      row_per_dst_type<std::complex<double>>()};
+        assert(map_by_dst_type.size() == _num_types);
+        int dst_id = 0;
+        for (const auto &row : map_by_dst_type) {
+            int src_id = 0;
+            for (const auto &fn_ptr : row) {
+                table[dst_id][src_id] = fn_ptr;
+                ++src_id;
+            }
+            ++dst_id;
+        }
+    }
+};
+
+template <typename funcPtrT,
+          template <typename fnT, typename T>
+          typename factory,
+          int _num_types>
+class DispatchVectorBuilder
+{
+private:
+    template <typename Ty>
+    const funcPtrT func_per_type() const
+    {
+        funcPtrT f = factory<funcPtrT, Ty>{}.get();
+        return f;
+    }
+
+public:
+    DispatchVectorBuilder() = default;
+    ~DispatchVectorBuilder() = default;
+
+    void populate_dispatch_vector(funcPtrT vector[]) const
+    {
+        const auto fn_map_by_type = {func_per_type<bool>(),
+                                     func_per_type<std::int8_t>(),
+                                     func_per_type<std::uint8_t>(),
+                                     func_per_type<std::int16_t>(),
+                                     func_per_type<std::uint16_t>(),
+                                     func_per_type<std::int32_t>(),
+                                     func_per_type<std::uint32_t>(),
+                                     func_per_type<std::int64_t>(),
+                                     func_per_type<std::uint64_t>(),
+                                     func_per_type<sycl::half>(),
+                                     func_per_type<float>(),
+                                     func_per_type<double>(),
+                                     func_per_type<std::complex<float>>(),
+                                     func_per_type<std::complex<double>>()};
+        assert(fn_map_by_type.size() == _num_types);
+        int ty_id = 0;
+        for (const auto &fn : fn_map_by_type) {
+            vector[ty_id] = fn;
+            ++ty_id;
+        }
+    }
+};
+
+/*! @brief struct to define result_type typename for Ty == ArgTy */
+template <typename Ty, typename ArgTy, typename ResTy = ArgTy>
+struct TypeMapResultEntry : std::is_same<Ty, ArgTy>
+{
+    using result_type = ResTy;
+};
+
+/*! @brief struct to define result_type typename for Ty1 == ArgTy1 && Ty2 ==
+ * ArgTy2 */
+template <typename Ty1,
+          typename ArgTy1,
+          typename Ty2,
+          typename ArgTy2,
+          typename ResTy>
+struct BinaryTypeMapResultEntry
+    : std::conjunction<std::is_same<Ty1, ArgTy1>, std::is_same<Ty2, ArgTy2>>
+{
+    using result_type = ResTy;
+};
+
+/*! @brief fall-through struct with specified result_type, usually void */
+template <typename Ty = void>
+struct DefaultResultEntry : std::true_type
+{
+    using result_type = Ty;
+};
+
+/*! @brief Utility struct to convert C++ type into typeid integer */
+template <typename T>
+struct GetTypeid
+{
+    int get()
+    {
+        if constexpr (std::is_same_v<T, bool>) {
+            return static_cast<int>(typenum_t::BOOL);
+        }
+        else if constexpr (std::is_same_v<T, std::int8_t>) {
+            return static_cast<int>(typenum_t::INT8);
+        }
+        else if constexpr (std::is_same_v<T, std::uint8_t>) {
+            return static_cast<int>(typenum_t::UINT8);
+        }
+        else if constexpr (std::is_same_v<T, std::int16_t>) {
+            return static_cast<int>(typenum_t::INT16);
+        }
+        else if constexpr (std::is_same_v<T, std::uint16_t>) {
+            return static_cast<int>(typenum_t::UINT16);
+        }
+        else if constexpr (std::is_same_v<T, std::int32_t>) {
+            return static_cast<int>(typenum_t::INT32);
+        }
+        else if constexpr (std::is_same_v<T, std::uint32_t>) {
+            return static_cast<int>(typenum_t::UINT32);
+        }
+        else if constexpr (std::is_same_v<T, std::int64_t>) {
+            return static_cast<int>(typenum_t::INT64);
+        }
+        else if constexpr (std::is_same_v<T, std::uint64_t>) {
+            return static_cast<int>(typenum_t::UINT64);
+        }
+        else if constexpr (std::is_same_v<T, sycl::half>) {
+            return static_cast<int>(typenum_t::HALF);
+        }
+        else if constexpr (std::is_same_v<T, float>) {
+            return static_cast<int>(typenum_t::FLOAT);
+        }
+        else if constexpr (std::is_same_v<T, double>) {
+            return static_cast<int>(typenum_t::DOUBLE);
+        }
+        else if constexpr (std::is_same_v<T, std::complex<float>>) {
+            return static_cast<int>(typenum_t::CFLOAT);
+        }
+        else if constexpr (std::is_same_v<T, std::complex<double>>) {
+            return static_cast<int>(typenum_t::CDOUBLE);
+        }
+        else if constexpr (std::is_same_v<T, void>) { // special token
+            return -1;
+        }
+
+        assert(("Unsupported type T", false));
+        return -2;
+    }
+};
+
+/*! @brief Class to generate vector of null function pointers */
+template <typename FunPtrT>
+struct NullPtrVector
+{
+
+    using value_type = FunPtrT;
+    using const_reference = value_type const &;
+
+    NullPtrVector() : val(nullptr) {}
+
+    const_reference operator[](int) const
+    {
+        return val;
+    }
+
+private:
+    value_type val;
+};
+
+/*! @brief Class to generate table of null function pointers */
+template <typename FunPtrT>
+struct NullPtrTable
+{
+    using value_type = NullPtrVector<FunPtrT>;
+    using const_reference = value_type const &;
+
+    NullPtrTable() : val() {}
+
+    const_reference operator[](int) const
+    {
+        return val;
+    }
+
+private:
+    value_type val;
+};
+
+template <typename Ty1, typename ArgTy, typename Ty2, typename outTy>
+struct TypePairDefinedEntry
+    : std::conjunction<std::is_same<Ty1, ArgTy>, std::is_same<Ty2, outTy>>
+{
+    static constexpr bool is_defined = true;
+};
+
+struct NotDefinedEntry : std::true_type
+{
+    static constexpr bool is_defined = false;
+};
+} // namespace dpctl::tensor::type_dispatch
diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp
new file mode 100644
index 000000000000..e5855081c727
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp
@@ -0,0 +1,164 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines functions for value casting.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::type_utils
+{
+template <typename T, typename = void>
+struct is_complex : public std::false_type
+{
+};
+
+template <typename T>
+struct is_complex<
+    T,
+    std::enable_if_t<std::is_same_v<std::remove_cv_t<T>, std::complex<float>> ||
+                     std::is_same_v<std::remove_cv_t<T>, std::complex<double>>>>
+    : public std::true_type
+{
+};
+
+template <typename T>
+inline constexpr bool is_complex_v = is_complex<T>::value;
+
+template <typename dstTy, typename srcTy>
+dstTy convert_impl(const srcTy &v)
+{
+    if constexpr (std::is_same_v<dstTy, srcTy>) {
+        return v;
+    }
+    else if constexpr (std::is_same_v<dstTy, bool>) {
+        if constexpr (is_complex_v<srcTy>) {
+            // bool(complex_v) ==
+            //     (complex_v.real() != 0) && (complex_v.imag() !=0)
+            return (convert_impl<bool, typename srcTy::value_type>(v.real()) ||
+                    convert_impl<bool, typename srcTy::value_type>(v.imag()));
+        }
+        else {
+            return static_cast<dstTy>(v != srcTy{0});
+        }
+    }
+    else if constexpr (std::is_same_v<srcTy, bool>) {
+        // C++ interprets a byte of storage behind bool by only
+        // testing is least significant bit, leading to both
+        // 0x00 and 0x02 interpreted as False, while 0x01 and 0xFF
+        // interpreted as True. NumPy's interpretation of underlying
+        // storage is different: any bit set is interpreted as True,
+        // no bits set as False, see gh-2121
+        const std::uint8_t &u = sycl::bit_cast<std::uint8_t>(v);
+        if constexpr (is_complex_v<dstTy>) {
+            return (u == 0) ? dstTy{} : dstTy{1, 0};
+        }
+        else {
+            return (u == 0) ? dstTy{} : dstTy{1};
+        }
+    }
+    else if constexpr (is_complex_v<srcTy> && !is_complex_v<dstTy>) {
+        // real_t(complex_v) == real_t(complex_v.real())
+        return convert_impl<dstTy, typename srcTy::value_type>(v.real());
+    }
+    else if constexpr (!std::is_integral_v<srcTy> &&
+                       !std::is_same_v<dstTy, bool> &&
+                       std::is_integral_v<dstTy> && std::is_unsigned_v<dstTy>)
+    {
+        // first cast to signed variant, the cast to unsigned one
+        using signedT = typename std::make_signed_t<dstTy>;
+        return static_cast<dstTy>(convert_impl<signedT, srcTy>(v));
+    }
+    else {
+        return static_cast<dstTy>(v);
+    }
+}
+
+template <typename T>
+void validate_type_for_device(const sycl::device &d)
+{
+    if constexpr (std::is_same_v<T, double>) {
+        if (!d.has(sycl::aspect::fp64)) {
+            throw std::runtime_error("Device " +
+                                     d.get_info<sycl::info::device::name>() +
+                                     " does not support type 'float64'");
+        }
+    }
+    else if constexpr (std::is_same_v<T, std::complex<double>>) {
+        if (!d.has(sycl::aspect::fp64)) {
+            throw std::runtime_error("Device " +
+                                     d.get_info<sycl::info::device::name>() +
+                                     " does not support type 'complex128'");
+        }
+    }
+    else if constexpr (std::is_same_v<T, sycl::half>) {
+        if (!d.has(sycl::aspect::fp16)) {
+            throw std::runtime_error("Device " +
+                                     d.get_info<sycl::info::device::name>() +
+                                     " does not support type 'float16'");
+        }
+    }
+}
+
+template <typename T>
+void validate_type_for_device(const sycl::queue &q)
+{
+    validate_type_for_device<T>(q.get_device());
+}
+
+template <typename Op, typename Vec, std::size_t... I>
+auto vec_cast_impl(const Vec &v, std::index_sequence<I...>)
+{
+    return Op{v[I]...};
+}
+
+template <typename dstT,
+          typename srcT,
+          std::size_t N,
+          typename Indices = std::make_index_sequence<N>>
+auto vec_cast(const sycl::vec<srcT, N> &s)
+{
+    if constexpr (std::is_same_v<srcT, dstT>) {
+        return s;
+    }
+    else {
+        return vec_cast_impl<sycl::vec<dstT, N>, sycl::vec<srcT, N>>(s,
+                                                                     Indices{});
+    }
+}
+} // namespace dpctl::tensor::type_utils
diff --git a/dpnp/backend/CMakeLists.txt b/dpnp/backend/CMakeLists.txt
index ddca557a08f4..433ab298d476 100644
--- a/dpnp/backend/CMakeLists.txt
+++ b/dpnp/backend/CMakeLists.txt
@@ -89,7 +89,6 @@ target_compile_definitions(${_trgt} PUBLIC PSTL_USE_PARALLEL_POLICIES=0)
 target_compile_definitions(${_trgt} PUBLIC ONEDPL_USE_PREDEFINED_POLICIES=0)
 
 target_include_directories(${_trgt} PUBLIC ${Dpctl_INCLUDE_DIR})
-target_include_directories(${_trgt} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
 
 target_link_directories(${_trgt} PUBLIC "${Dpctl_INCLUDE_DIR}/..")
 target_link_libraries(${_trgt} PUBLIC DPCTLSyclInterface)
diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt
index 267567c69e71..0015eda84843 100644
--- a/dpnp/backend/extensions/blas/CMakeLists.txt
+++ b/dpnp/backend/extensions/blas/CMakeLists.txt
@@ -65,15 +65,12 @@ set_target_properties(
 
 target_include_directories(
     ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../
+    PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/../common
+        ${CMAKE_SOURCE_DIR}/dpnp/backend/include
+        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
 )
-target_include_directories(
-    ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
-)
-
-target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS})
-target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
+target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR})
 
 if(WIN32)
     target_compile_options(
diff --git a/dpnp/backend/extensions/blas/dot_common.hpp b/dpnp/backend/extensions/blas/dot_common.hpp
index 1672e7217cba..369e3320473c 100644
--- a/dpnp/backend/extensions/blas/dot_common.hpp
+++ b/dpnp/backend/extensions/blas/dot_common.hpp
@@ -29,6 +29,7 @@
 #pragma once
 
 #include <oneapi/mkl.hpp>
+
 #include <pybind11/pybind11.h>
 
 // dpctl tensor headers
diff --git a/dpnp/backend/extensions/blas/gemm.hpp b/dpnp/backend/extensions/blas/gemm.hpp
index 997d515f98a0..59a3d911d885 100644
--- a/dpnp/backend/extensions/blas/gemm.hpp
+++ b/dpnp/backend/extensions/blas/gemm.hpp
@@ -31,7 +31,7 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::blas
 {
diff --git a/dpnp/backend/extensions/blas/gemv.hpp b/dpnp/backend/extensions/blas/gemv.hpp
index afe0c6387aa9..6da71ed0964f 100644
--- a/dpnp/backend/extensions/blas/gemv.hpp
+++ b/dpnp/backend/extensions/blas/gemv.hpp
@@ -31,7 +31,7 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::blas
 {
diff --git a/dpnp/backend/extensions/blas/syrk.hpp b/dpnp/backend/extensions/blas/syrk.hpp
index 580239b28008..f6cec189489a 100644
--- a/dpnp/backend/extensions/blas/syrk.hpp
+++ b/dpnp/backend/extensions/blas/syrk.hpp
@@ -31,7 +31,7 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::blas
 {
diff --git a/dpnp/backend/extensions/common/ext/common.hpp b/dpnp/backend/extensions/common/ext/common.hpp
index d626b56ea00c..036eb635a3bd 100644
--- a/dpnp/backend/extensions/common/ext/common.hpp
+++ b/dpnp/backend/extensions/common/ext/common.hpp
@@ -29,8 +29,10 @@
 #pragma once
 
 #include <complex>
+
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
+
 #include <sycl/sycl.hpp>
 
 // dpctl tensor headers
diff --git a/dpnp/backend/extensions/common/ext/details/common_internal.hpp b/dpnp/backend/extensions/common/ext/details/common_internal.hpp
index 31d9671a0a43..8db72ce32318 100644
--- a/dpnp/backend/extensions/common/ext/details/common_internal.hpp
+++ b/dpnp/backend/extensions/common/ext/details/common_internal.hpp
@@ -30,9 +30,11 @@
 
 #include <algorithm>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
 #include "ext/common.hpp"
 #include "utils/type_dispatch.hpp"
-#include <pybind11/pybind11.h>
 
 namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
 
diff --git a/dpnp/backend/extensions/common/ext/validation_utils.hpp b/dpnp/backend/extensions/common/ext/validation_utils.hpp
index d41db8d5ca5a..03e0718d4450 100644
--- a/dpnp/backend/extensions/common/ext/validation_utils.hpp
+++ b/dpnp/backend/extensions/common/ext/validation_utils.hpp
@@ -32,7 +32,10 @@
 #include <unordered_map>
 #include <vector>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
+
+// dpctl tensor headers
+#include "utils/type_dispatch.hpp"
 
 namespace ext::validation
 {
diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp
index c996ac07df02..e23f74a678dc 100644
--- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp
+++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp
@@ -34,12 +34,12 @@
 #include <utility>
 #include <vector>
 
-#include <sycl/sycl.hpp>
-
-#include "dpctl4pybind11.hpp"
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
 
 #include "elementwise_functions_type_utils.hpp"
 #include "simplify_iteration_space.hpp"
diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp
index 62f7584a3e0c..7300f938eabb 100644
--- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp
+++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp
@@ -26,12 +26,13 @@
 // THE POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
 
-#include "dpctl4pybind11.hpp"
-
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
+
 #include <sycl/sycl.hpp>
 
+#include "dpnp4pybind11.hpp"
+
 #include "elementwise_functions_type_utils.hpp"
 
 // dpctl tensor headers
diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp
index 1bb6fedd7027..58fe43c01589 100644
--- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp
+++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp
@@ -28,10 +28,10 @@
 
 #pragma once
 
-#include "dpctl4pybind11.hpp"
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 // dpctl tensor headers
 #include "utils/type_dispatch.hpp"
diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt
index 50468857e3b9..0569ecc8bca4 100644
--- a/dpnp/backend/extensions/fft/CMakeLists.txt
+++ b/dpnp/backend/extensions/fft/CMakeLists.txt
@@ -59,15 +59,11 @@ set_target_properties(
 
 target_include_directories(
     ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+    PRIVATE
+        ${CMAKE_SOURCE_DIR}/dpnp/backend/include
+        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
 )
-target_include_directories(
-    ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src
-)
-
-target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS})
-target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
+target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR})
 
 if(WIN32)
     target_compile_options(
diff --git a/dpnp/backend/extensions/fft/in_place.hpp b/dpnp/backend/extensions/fft/in_place.hpp
index 7eed11565b9e..bc35201b9b6e 100644
--- a/dpnp/backend/extensions/fft/in_place.hpp
+++ b/dpnp/backend/extensions/fft/in_place.hpp
@@ -28,10 +28,13 @@
 
 #pragma once
 
+#include <utility>
+#include <vector>
+
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::fft
 {
diff --git a/dpnp/backend/extensions/fft/in_place.tpp b/dpnp/backend/extensions/fft/in_place.tpp
index 4bc166b0e7ae..ace535284de6 100644
--- a/dpnp/backend/extensions/fft/in_place.tpp
+++ b/dpnp/backend/extensions/fft/in_place.tpp
@@ -27,15 +27,23 @@
 //*****************************************************************************
 
 #pragma once
+
 #include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <pybind11/pybind11.h>
 
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "fft_utils.hpp"
+#include "in_place.hpp"
+
 // dpctl tensor headers
 #include "utils/output_validation.hpp"
 
diff --git a/dpnp/backend/extensions/fft/out_of_place.hpp b/dpnp/backend/extensions/fft/out_of_place.hpp
index 811a2bd6d1c4..55ca9383baaf 100644
--- a/dpnp/backend/extensions/fft/out_of_place.hpp
+++ b/dpnp/backend/extensions/fft/out_of_place.hpp
@@ -28,10 +28,13 @@
 
 #pragma once
 
+#include <utility>
+#include <vector>
+
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::fft
 {
diff --git a/dpnp/backend/extensions/fft/out_of_place.tpp b/dpnp/backend/extensions/fft/out_of_place.tpp
index 290408dc60bc..e468246ea7af 100644
--- a/dpnp/backend/extensions/fft/out_of_place.tpp
+++ b/dpnp/backend/extensions/fft/out_of_place.tpp
@@ -27,15 +27,25 @@
 //*****************************************************************************
 
 #pragma once
+
+#include <cstddef>
+#include <cstdint>
 #include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
 
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include <pybind11/pybind11.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "fft_utils.hpp"
+#include "out_of_place.hpp"
+
 // dpctl tensor headers
 #include "utils/memory_overlap.hpp"
 #include "utils/output_validation.hpp"
diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt
index a6691f31f559..c0de75ae3146 100644
--- a/dpnp/backend/extensions/indexing/CMakeLists.txt
+++ b/dpnp/backend/extensions/indexing/CMakeLists.txt
@@ -62,15 +62,12 @@ set_target_properties(
 
 target_include_directories(
     ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../
+    PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/../common
+        ${CMAKE_SOURCE_DIR}/dpnp/backend/include
+        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
 )
-target_include_directories(
-    ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
-)
-
 target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR})
-target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
 
 if(WIN32)
     target_compile_options(
diff --git a/dpnp/backend/extensions/indexing/choose.cpp b/dpnp/backend/extensions/indexing/choose.cpp
index 99d91744366f..7b5284418b00 100644
--- a/dpnp/backend/extensions/indexing/choose.cpp
+++ b/dpnp/backend/extensions/indexing/choose.cpp
@@ -30,15 +30,18 @@
 #include <cstddef>
 #include <cstdint>
 #include <memory>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
 #include <type_traits>
 #include <utility>
 #include <vector>
 
+#include <sycl/sycl.hpp>
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
+
 #include "choose_kernel.hpp"
-#include "dpctl4pybind11.hpp"
 
 // utils extension header
 #include "ext/common.hpp"
@@ -52,7 +55,6 @@
 
 namespace dpnp::extensions::indexing
 {
-
 namespace td_ns = dpctl::tensor::type_dispatch;
 
 static kernels::choose_fn_ptr_t choose_clip_dispatch_table[td_ns::num_types]
@@ -459,5 +461,4 @@ void init_choose(py::module_ m)
 
     return;
 }
-
 } // namespace dpnp::extensions::indexing
diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt
index 5e8b95963e94..76b25c3a6d10 100644
--- a/dpnp/backend/extensions/lapack/CMakeLists.txt
+++ b/dpnp/backend/extensions/lapack/CMakeLists.txt
@@ -82,15 +82,13 @@ set_target_properties(
 
 target_include_directories(
     ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../
+    PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/../../
+        ${CMAKE_CURRENT_SOURCE_DIR}/../common
+        ${CMAKE_SOURCE_DIR}/dpnp/backend/include
+        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
 )
-target_include_directories(
-    ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
-)
-
 target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR})
-target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
 
 if(WIN32)
     target_compile_options(
diff --git a/dpnp/backend/extensions/lapack/geqrf.hpp b/dpnp/backend/extensions/lapack/geqrf.hpp
index 522006ace8ab..7be1fee971cf 100644
--- a/dpnp/backend/extensions/lapack/geqrf.hpp
+++ b/dpnp/backend/extensions/lapack/geqrf.hpp
@@ -31,7 +31,7 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::lapack
 {
diff --git a/dpnp/backend/extensions/lapack/gesv.hpp b/dpnp/backend/extensions/lapack/gesv.hpp
index d4198efae62e..a86039c9b72e 100644
--- a/dpnp/backend/extensions/lapack/gesv.hpp
+++ b/dpnp/backend/extensions/lapack/gesv.hpp
@@ -31,7 +31,7 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::lapack
 {
diff --git a/dpnp/backend/extensions/lapack/gesvd.hpp b/dpnp/backend/extensions/lapack/gesvd.hpp
index 116348e01d9f..b2fea5e47299 100644
--- a/dpnp/backend/extensions/lapack/gesvd.hpp
+++ b/dpnp/backend/extensions/lapack/gesvd.hpp
@@ -31,7 +31,7 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::lapack
 {
diff --git a/dpnp/backend/extensions/lapack/getrf.hpp b/dpnp/backend/extensions/lapack/getrf.hpp
index 24ec473f4dc7..ce6dc3e788b5 100644
--- a/dpnp/backend/extensions/lapack/getrf.hpp
+++ b/dpnp/backend/extensions/lapack/getrf.hpp
@@ -31,7 +31,7 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::lapack
 {
diff --git a/dpnp/backend/extensions/lapack/getri.hpp b/dpnp/backend/extensions/lapack/getri.hpp
index d8c8e58f3fcb..728af4a77e01 100644
--- a/dpnp/backend/extensions/lapack/getri.hpp
+++ b/dpnp/backend/extensions/lapack/getri.hpp
@@ -31,7 +31,7 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::lapack
 {
diff --git a/dpnp/backend/extensions/lapack/getrs.hpp b/dpnp/backend/extensions/lapack/getrs.hpp
index f5a47c69c9ec..2728b0c4e04a 100644
--- a/dpnp/backend/extensions/lapack/getrs.hpp
+++ b/dpnp/backend/extensions/lapack/getrs.hpp
@@ -31,7 +31,7 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::lapack
 {
diff --git a/dpnp/backend/extensions/lapack/heevd.cpp b/dpnp/backend/extensions/lapack/heevd.cpp
index 5990e5344a17..923e950b1383 100644
--- a/dpnp/backend/extensions/lapack/heevd.cpp
+++ b/dpnp/backend/extensions/lapack/heevd.cpp
@@ -28,6 +28,7 @@
 
 #include <stdexcept>
 
+#include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
 #include "evd_common.hpp"
diff --git a/dpnp/backend/extensions/lapack/heevd_batch.cpp b/dpnp/backend/extensions/lapack/heevd_batch.cpp
index e1c1a96bc320..9d7c3300dbf7 100644
--- a/dpnp/backend/extensions/lapack/heevd_batch.cpp
+++ b/dpnp/backend/extensions/lapack/heevd_batch.cpp
@@ -28,6 +28,7 @@
 
 #include <stdexcept>
 
+#include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
 #include "common_helpers.hpp"
diff --git a/dpnp/backend/extensions/lapack/orgqr.hpp b/dpnp/backend/extensions/lapack/orgqr.hpp
index 962edc7b668f..2502fe567a1f 100644
--- a/dpnp/backend/extensions/lapack/orgqr.hpp
+++ b/dpnp/backend/extensions/lapack/orgqr.hpp
@@ -31,7 +31,7 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::lapack
 {
diff --git a/dpnp/backend/extensions/lapack/potrf.hpp b/dpnp/backend/extensions/lapack/potrf.hpp
index d5df48a9ddf4..02faf2c04fde 100644
--- a/dpnp/backend/extensions/lapack/potrf.hpp
+++ b/dpnp/backend/extensions/lapack/potrf.hpp
@@ -31,7 +31,7 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::lapack
 {
diff --git a/dpnp/backend/extensions/lapack/syevd.cpp b/dpnp/backend/extensions/lapack/syevd.cpp
index af69cf9e6b7e..3c09ca4f587b 100644
--- a/dpnp/backend/extensions/lapack/syevd.cpp
+++ b/dpnp/backend/extensions/lapack/syevd.cpp
@@ -28,6 +28,7 @@
 
 #include <stdexcept>
 
+#include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
 #include "evd_common.hpp"
diff --git a/dpnp/backend/extensions/lapack/syevd_batch.cpp b/dpnp/backend/extensions/lapack/syevd_batch.cpp
index 0c326e5d79bb..36d1c820f00d 100644
--- a/dpnp/backend/extensions/lapack/syevd_batch.cpp
+++ b/dpnp/backend/extensions/lapack/syevd_batch.cpp
@@ -28,6 +28,7 @@
 
 #include <stdexcept>
 
+#include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
 #include "common_helpers.hpp"
diff --git a/dpnp/backend/extensions/lapack/ungqr.hpp b/dpnp/backend/extensions/lapack/ungqr.hpp
index a149af1e24e1..8c9a36b3f4a6 100644
--- a/dpnp/backend/extensions/lapack/ungqr.hpp
+++ b/dpnp/backend/extensions/lapack/ungqr.hpp
@@ -31,7 +31,7 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
+#include "dpnp4pybind11.hpp"
 
 namespace dpnp::extensions::lapack
 {
diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index 9561daf27ce2..e04279b75e49 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -67,19 +67,12 @@ set_target_properties(
 
 target_include_directories(
     ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+    PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/../common
+        ${CMAKE_SOURCE_DIR}/dpnp/backend/include
+        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
 )
-target_include_directories(
-    ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src
-)
-target_include_directories(
-    ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
-)
-
 target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR})
-target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
 
 if(WIN32)
     target_compile_options(
diff --git a/dpnp/backend/extensions/statistics/bincount.hpp b/dpnp/backend/extensions/statistics/bincount.hpp
index 5e42952349b0..2fc477e71edc 100644
--- a/dpnp/backend/extensions/statistics/bincount.hpp
+++ b/dpnp/backend/extensions/statistics/bincount.hpp
@@ -31,7 +31,8 @@
 #include <pybind11/pybind11.h>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
+
 #include "ext/dispatch_table.hpp"
 
 namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
diff --git a/dpnp/backend/extensions/statistics/histogram.cpp b/dpnp/backend/extensions/statistics/histogram.cpp
index 6d7da6836f60..afc5d9638f48 100644
--- a/dpnp/backend/extensions/statistics/histogram.cpp
+++ b/dpnp/backend/extensions/statistics/histogram.cpp
@@ -35,8 +35,9 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include "dpnp4pybind11.hpp"
+
 // dpctl tensor headers
-#include "dpctl4pybind11.hpp"
 #include "utils/type_dispatch.hpp"
 
 #include "histogram.hpp"
@@ -50,7 +51,6 @@ using namespace ext::common;
 
 namespace
 {
-
 template <typename T, typename DataStorage>
 struct HistogramEdges
 {
diff --git a/dpnp/backend/extensions/statistics/histogram.hpp b/dpnp/backend/extensions/statistics/histogram.hpp
index c6a79ec24ee3..d04d8edbf02b 100644
--- a/dpnp/backend/extensions/statistics/histogram.hpp
+++ b/dpnp/backend/extensions/statistics/histogram.hpp
@@ -31,7 +31,9 @@
 #include <pybind11/pybind11.h>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
+
+// utils extension header
 #include "ext/dispatch_table.hpp"
 
 namespace statistics::histogram
diff --git a/dpnp/backend/extensions/statistics/histogram_common.cpp b/dpnp/backend/extensions/statistics/histogram_common.cpp
index 82afa2bd965d..252e1cd7c7cc 100644
--- a/dpnp/backend/extensions/statistics/histogram_common.cpp
+++ b/dpnp/backend/extensions/statistics/histogram_common.cpp
@@ -31,15 +31,18 @@
 #include <string>
 #include <vector>
 
-#include "dpctl4pybind11.hpp"
-#include "utils/type_dispatch.hpp"
-
 #include <pybind11/pybind11.h>
 
+#include "dpnp4pybind11.hpp"
+
 #include "histogram_common.hpp"
 
+// utils extension header
 #include "ext/validation_utils.hpp"
 
+// dpctl tensor headers
+#include "utils/type_dispatch.hpp"
+
 namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
 using dpctl::tensor::usm_ndarray;
 using dpctl_td_ns::typenum_t;
@@ -57,7 +60,6 @@ using ext::validation::name_of;
 
 namespace statistics::histogram
 {
-
 void validate(const usm_ndarray &sample,
               const std::optional<const dpctl::tensor::usm_ndarray> &bins,
               const std::optional<const dpctl::tensor::usm_ndarray> &weights,
diff --git a/dpnp/backend/extensions/statistics/histogramdd.hpp b/dpnp/backend/extensions/statistics/histogramdd.hpp
index 327e9941dbc6..d7c46ae34b7d 100644
--- a/dpnp/backend/extensions/statistics/histogramdd.hpp
+++ b/dpnp/backend/extensions/statistics/histogramdd.hpp
@@ -31,7 +31,9 @@
 #include <pybind11/pybind11.h>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
+
+// utils extension header
 #include "ext/dispatch_table.hpp"
 
 namespace statistics::histogram
diff --git a/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp b/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp
index b8f679f1030e..6c0e39a11a19 100644
--- a/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp
+++ b/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp
@@ -33,11 +33,14 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include "dpnp4pybind11.hpp"
+
+// utils extension header
+#include "ext/common.hpp"
+
 // dpctl tensor headers
-#include "dpctl4pybind11.hpp"
 #include "utils/type_dispatch.hpp"
 
-#include "ext/common.hpp"
 #include "sliding_dot_product1d.hpp"
 #include "sliding_window1d.hpp"
 
@@ -51,7 +54,6 @@ using namespace ext::common;
 
 namespace
 {
-
 template <typename T>
 struct SlidingDotProductF
 {
diff --git a/dpnp/backend/extensions/statistics/sliding_window1d.cpp b/dpnp/backend/extensions/statistics/sliding_window1d.cpp
index 3ae66daa332b..81f8ae40104e 100644
--- a/dpnp/backend/extensions/statistics/sliding_window1d.cpp
+++ b/dpnp/backend/extensions/statistics/sliding_window1d.cpp
@@ -29,11 +29,16 @@
 #include <string>
 #include <vector>
 
-#include "dpctl4pybind11.hpp"
-#include "utils/type_dispatch.hpp"
 #include <pybind11/pybind11.h>
 
+#include "dpnp4pybind11.hpp"
+
+// utils extension header
 #include "ext/validation_utils.hpp"
+
+// dpctl tensor headers
+#include "utils/type_dispatch.hpp"
+
 #include "sliding_window1d.hpp"
 
 namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
@@ -48,7 +53,6 @@ using ext::validation::name_of;
 
 namespace statistics::sliding_window1d
 {
-
 void validate(const usm_ndarray &a,
               const usm_ndarray &v,
               const usm_ndarray &out,
@@ -89,5 +93,4 @@ void validate(const usm_ndarray &a,
             std::to_string(expected_output_size) + ")");
     }
 }
-
 } // namespace statistics::sliding_window1d
diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt
index b24d5d131cfe..55a750f8423f 100644
--- a/dpnp/backend/extensions/ufunc/CMakeLists.txt
+++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt
@@ -84,15 +84,13 @@ set_target_properties(
 
 target_include_directories(
     ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../
+    PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/../../
+        ${CMAKE_CURRENT_SOURCE_DIR}/../common
+        ${CMAKE_SOURCE_DIR}/dpnp/backend/include
+        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
 )
-target_include_directories(
-    ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
-)
-
 target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR})
-target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
 
 if(_dpnp_sycl_targets)
     # make fat binary
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp
index a0842f4ef259..761bd330a326 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp
@@ -30,9 +30,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "bitwise_count.hpp"
 #include "kernels/elementwise_functions/bitwise_count.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp
index 77452a6b777f..729fcb576c77 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp
@@ -29,9 +29,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "degrees.hpp"
 #include "kernels/elementwise_functions/degrees.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp
index af87dcc85f53..1bb3859a39f4 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp
@@ -30,9 +30,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "divmod.hpp"
 #include "kernels/elementwise_functions/divmod.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp
index 5254e50d3faf..fff0118d06aa 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp
@@ -29,9 +29,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "erf_funcs.hpp"
 #include "kernels/elementwise_functions/erf.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp
index d2b6ae24ac4b..f7c2183633af 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp
@@ -29,9 +29,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "fabs.hpp"
 #include "kernels/elementwise_functions/fabs.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp
index 0994afc7c738..43927eb93806 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp
@@ -30,9 +30,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "float_power.hpp"
 
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp
index 5e1a9f33444b..9471feaf2166 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp
@@ -28,9 +28,13 @@
 
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "fmax.hpp"
 #include "kernels/elementwise_functions/fmax.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp
index c0e1db654317..8e279897f414 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp
@@ -28,9 +28,13 @@
 
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "fmin.hpp"
 #include "kernels/elementwise_functions/fmin.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp
index 5b83595b3f7c..83fb750b6907 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp
@@ -30,9 +30,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "fmod.hpp"
 #include "kernels/elementwise_functions/fmod.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp
index 4439f1e76993..17e09f3ee816 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp
@@ -31,9 +31,13 @@
 #include <utility>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "frexp.hpp"
 #include "kernels/elementwise_functions/frexp.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp
index ec10504fa15e..0481365356ca 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp
@@ -30,9 +30,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "gcd.hpp"
 #include "kernels/elementwise_functions/gcd.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp
index e3212de86f7f..62affd206420 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp
@@ -29,9 +29,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "heaviside.hpp"
 #include "kernels/elementwise_functions/heaviside.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp
index 4d120a56e837..53ded341b58b 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp
@@ -29,9 +29,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "i0.hpp"
 #include "kernels/elementwise_functions/i0.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
index 33c7ab19b9ab..82e96ab732de 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
@@ -35,12 +35,14 @@
 #include <utility>
 #include <vector>
 
-#include <sycl/sycl.hpp>
-
-#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+
 // dpctl tensor headers
 #include "utils/type_dispatch.hpp"
 #include "utils/type_utils.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp
index b8179feb9263..3025cbf16586 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp
@@ -32,12 +32,14 @@
 #include <type_traits>
 #include <vector>
 
-#include <sycl/sycl.hpp>
-
-#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+
 #include "kernels/elementwise_functions/isclose.hpp"
 
 #include "../../elementwise_functions/simplify_iteration_space.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp
index 4276ceb6b246..35138e903eac 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp
@@ -30,9 +30,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "kernels/elementwise_functions/lcm.hpp"
 #include "lcm.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp
index 3e2c4f3d0149..44ef51726a6a 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp
@@ -30,9 +30,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "kernels/elementwise_functions/ldexp.hpp"
 #include "ldexp.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp
index 57c7c60ca9cf..e37f13b119d6 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp
@@ -28,9 +28,13 @@
 
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "kernels/elementwise_functions/logaddexp2.hpp"
 #include "logaddexp2.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp
index f8aab23d5630..266103248521 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp
@@ -31,9 +31,13 @@
 #include <utility>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "kernels/elementwise_functions/modf.hpp"
 #include "modf.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp
index 2490f1921a98..c30d388f8afd 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp
@@ -38,11 +38,12 @@
 
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include "dpnp4pybind11.hpp"
+
 #include "kernels/elementwise_functions/nan_to_num.hpp"
 
 #include "../../elementwise_functions/simplify_iteration_space.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp
index 7fc8ae5331dd..0a481fd33d11 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp
@@ -29,9 +29,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "kernels/elementwise_functions/radians.hpp"
 #include "populate.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp
index abd02e1e6282..87a911472db2 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp
@@ -30,9 +30,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "kernels/elementwise_functions/sinc.hpp"
 #include "populate.hpp"
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp
index 6e401c5388dd..4c14582f30ae 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp
@@ -29,9 +29,13 @@
 #include <type_traits>
 #include <vector>
 
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "kernels/elementwise_functions/spacing.hpp"
 #include "populate.hpp"
diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt
index 0e3a17df77e0..32d6a6765a00 100644
--- a/dpnp/backend/extensions/vm/CMakeLists.txt
+++ b/dpnp/backend/extensions/vm/CMakeLists.txt
@@ -107,15 +107,12 @@ set_target_properties(
 
 target_include_directories(
     ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../
+    PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/../common
+        ${CMAKE_SOURCE_DIR}/dpnp/backend/include
+        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
 )
-target_include_directories(
-    ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
-)
-
 target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR})
-target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
 
 if(WIN32)
     target_compile_options(
diff --git a/dpnp/backend/extensions/vm/abs.cpp b/dpnp/backend/extensions/vm/abs.cpp
index 133f3077ac43..1dc8143dd5ff 100644
--- a/dpnp/backend/extensions/vm/abs.cpp
+++ b/dpnp/backend/extensions/vm/abs.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "abs.hpp"
 #include "common.hpp"
diff --git a/dpnp/backend/extensions/vm/acos.cpp b/dpnp/backend/extensions/vm/acos.cpp
index 0cb9bb32f4b8..15b4ce80cc3c 100644
--- a/dpnp/backend/extensions/vm/acos.cpp
+++ b/dpnp/backend/extensions/vm/acos.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "acos.hpp"
 #include "common.hpp"
diff --git a/dpnp/backend/extensions/vm/acosh.cpp b/dpnp/backend/extensions/vm/acosh.cpp
index fa25ecf5cc1e..eed835b78e10 100644
--- a/dpnp/backend/extensions/vm/acosh.cpp
+++ b/dpnp/backend/extensions/vm/acosh.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "acosh.hpp"
 #include "common.hpp"
diff --git a/dpnp/backend/extensions/vm/add.cpp b/dpnp/backend/extensions/vm/add.cpp
index 165671c93415..a58aac727cd1 100644
--- a/dpnp/backend/extensions/vm/add.cpp
+++ b/dpnp/backend/extensions/vm/add.cpp
@@ -36,7 +36,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "add.hpp"
 #include "common.hpp"
diff --git a/dpnp/backend/extensions/vm/arg.cpp b/dpnp/backend/extensions/vm/arg.cpp
index e062f1f2ee06..c50c4a33dee1 100644
--- a/dpnp/backend/extensions/vm/arg.cpp
+++ b/dpnp/backend/extensions/vm/arg.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "arg.hpp"
 #include "common.hpp"
diff --git a/dpnp/backend/extensions/vm/asin.cpp b/dpnp/backend/extensions/vm/asin.cpp
index 8a2e1c079ed8..5af7033fed21 100644
--- a/dpnp/backend/extensions/vm/asin.cpp
+++ b/dpnp/backend/extensions/vm/asin.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "asin.hpp"
 #include "common.hpp"
diff --git a/dpnp/backend/extensions/vm/asinh.cpp b/dpnp/backend/extensions/vm/asinh.cpp
index 176bacdb92a8..5b0f8ed13106 100644
--- a/dpnp/backend/extensions/vm/asinh.cpp
+++ b/dpnp/backend/extensions/vm/asinh.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "asinh.hpp"
 #include "common.hpp"
diff --git a/dpnp/backend/extensions/vm/atan.cpp b/dpnp/backend/extensions/vm/atan.cpp
index 21c8c8f1c9d5..2255000c1c4b 100644
--- a/dpnp/backend/extensions/vm/atan.cpp
+++ b/dpnp/backend/extensions/vm/atan.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "atan.hpp"
 #include "common.hpp"
diff --git a/dpnp/backend/extensions/vm/atan2.cpp b/dpnp/backend/extensions/vm/atan2.cpp
index 1d4e5c333e68..bf29e2921a1d 100644
--- a/dpnp/backend/extensions/vm/atan2.cpp
+++ b/dpnp/backend/extensions/vm/atan2.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "atan2.hpp"
 #include "common.hpp"
diff --git a/dpnp/backend/extensions/vm/atanh.cpp b/dpnp/backend/extensions/vm/atanh.cpp
index 7097fabf602f..9daab09980e6 100644
--- a/dpnp/backend/extensions/vm/atanh.cpp
+++ b/dpnp/backend/extensions/vm/atanh.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "atanh.hpp"
 #include "common.hpp"
diff --git a/dpnp/backend/extensions/vm/cbrt.cpp b/dpnp/backend/extensions/vm/cbrt.cpp
index db3cdfcebd8d..34ff8dd913ac 100644
--- a/dpnp/backend/extensions/vm/cbrt.cpp
+++ b/dpnp/backend/extensions/vm/cbrt.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "cbrt.hpp"
 #include "common.hpp"
diff --git a/dpnp/backend/extensions/vm/ceil.cpp b/dpnp/backend/extensions/vm/ceil.cpp
index 6f5aeba16f99..e76a30d28317 100644
--- a/dpnp/backend/extensions/vm/ceil.cpp
+++ b/dpnp/backend/extensions/vm/ceil.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "ceil.hpp"
 #include "common.hpp"
diff --git a/dpnp/backend/extensions/vm/common.hpp b/dpnp/backend/extensions/vm/common.hpp
index 6ee73504ce96..81e113771def 100644
--- a/dpnp/backend/extensions/vm/common.hpp
+++ b/dpnp/backend/extensions/vm/common.hpp
@@ -34,10 +34,10 @@
 #include <vector>
 
 #include <oneapi/mkl.hpp>
+#include <pybind11/pybind11.h>
 #include <sycl/sycl.hpp>
 
-#include <dpctl4pybind11.hpp>
-#include <pybind11/pybind11.h>
+#include "dpnp4pybind11.hpp"
 
 // utils extension header
 #include "ext/common.hpp"
diff --git a/dpnp/backend/extensions/vm/conj.cpp b/dpnp/backend/extensions/vm/conj.cpp
index 36710104750a..f77020cf1d55 100644
--- a/dpnp/backend/extensions/vm/conj.cpp
+++ b/dpnp/backend/extensions/vm/conj.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "conj.hpp"
diff --git a/dpnp/backend/extensions/vm/copysign.cpp b/dpnp/backend/extensions/vm/copysign.cpp
index cd90abf65a06..15c0fceec413 100644
--- a/dpnp/backend/extensions/vm/copysign.cpp
+++ b/dpnp/backend/extensions/vm/copysign.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "copysign.hpp"
diff --git a/dpnp/backend/extensions/vm/cos.cpp b/dpnp/backend/extensions/vm/cos.cpp
index 76db72594763..7c9b0c35d6ca 100644
--- a/dpnp/backend/extensions/vm/cos.cpp
+++ b/dpnp/backend/extensions/vm/cos.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "cos.hpp"
diff --git a/dpnp/backend/extensions/vm/cosh.cpp b/dpnp/backend/extensions/vm/cosh.cpp
index 464410b1accc..a95c7075ba61 100644
--- a/dpnp/backend/extensions/vm/cosh.cpp
+++ b/dpnp/backend/extensions/vm/cosh.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "cosh.hpp"
diff --git a/dpnp/backend/extensions/vm/div.cpp b/dpnp/backend/extensions/vm/div.cpp
index ad96f9acf083..6e0cb4d0439f 100644
--- a/dpnp/backend/extensions/vm/div.cpp
+++ b/dpnp/backend/extensions/vm/div.cpp
@@ -36,7 +36,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "div.hpp"
diff --git a/dpnp/backend/extensions/vm/erf_funcs.cpp b/dpnp/backend/extensions/vm/erf_funcs.cpp
index 4e84403eb061..7be7f691edcf 100644
--- a/dpnp/backend/extensions/vm/erf_funcs.cpp
+++ b/dpnp/backend/extensions/vm/erf_funcs.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "erf_funcs.hpp"
diff --git a/dpnp/backend/extensions/vm/exp.cpp b/dpnp/backend/extensions/vm/exp.cpp
index acd265d191f7..31f50f36171d 100644
--- a/dpnp/backend/extensions/vm/exp.cpp
+++ b/dpnp/backend/extensions/vm/exp.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "exp.hpp"
diff --git a/dpnp/backend/extensions/vm/exp2.cpp b/dpnp/backend/extensions/vm/exp2.cpp
index 82c6c32fb6c5..41f18351fa7d 100644
--- a/dpnp/backend/extensions/vm/exp2.cpp
+++ b/dpnp/backend/extensions/vm/exp2.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "exp2.hpp"
diff --git a/dpnp/backend/extensions/vm/expm1.cpp b/dpnp/backend/extensions/vm/expm1.cpp
index 93cef7b3272d..37440cab9b0c 100644
--- a/dpnp/backend/extensions/vm/expm1.cpp
+++ b/dpnp/backend/extensions/vm/expm1.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "expm1.hpp"
diff --git a/dpnp/backend/extensions/vm/floor.cpp b/dpnp/backend/extensions/vm/floor.cpp
index fb1a86eda7bf..771d141e7f6a 100644
--- a/dpnp/backend/extensions/vm/floor.cpp
+++ b/dpnp/backend/extensions/vm/floor.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "floor.hpp"
diff --git a/dpnp/backend/extensions/vm/fmax.cpp b/dpnp/backend/extensions/vm/fmax.cpp
index 32786a3e8fc2..d01b3ef3dc42 100644
--- a/dpnp/backend/extensions/vm/fmax.cpp
+++ b/dpnp/backend/extensions/vm/fmax.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "fmax.hpp"
diff --git a/dpnp/backend/extensions/vm/fmin.cpp b/dpnp/backend/extensions/vm/fmin.cpp
index d923b8c7ddfb..6fbebba556f8 100644
--- a/dpnp/backend/extensions/vm/fmin.cpp
+++ b/dpnp/backend/extensions/vm/fmin.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "fmin.hpp"
diff --git a/dpnp/backend/extensions/vm/fmod.cpp b/dpnp/backend/extensions/vm/fmod.cpp
index 6c8a4ac705e4..1330453d6f84 100644
--- a/dpnp/backend/extensions/vm/fmod.cpp
+++ b/dpnp/backend/extensions/vm/fmod.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "fmod.hpp"
diff --git a/dpnp/backend/extensions/vm/hypot.cpp b/dpnp/backend/extensions/vm/hypot.cpp
index 92b7c78f8ad6..a9b3d3c12288 100644
--- a/dpnp/backend/extensions/vm/hypot.cpp
+++ b/dpnp/backend/extensions/vm/hypot.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "hypot.hpp"
diff --git a/dpnp/backend/extensions/vm/i0.cpp b/dpnp/backend/extensions/vm/i0.cpp
index 5db3ef9d9669..50f692ebd958 100644
--- a/dpnp/backend/extensions/vm/i0.cpp
+++ b/dpnp/backend/extensions/vm/i0.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "i0.hpp"
diff --git a/dpnp/backend/extensions/vm/inv.cpp b/dpnp/backend/extensions/vm/inv.cpp
index 1adeb1be23d0..eda08a6d0cd5 100644
--- a/dpnp/backend/extensions/vm/inv.cpp
+++ b/dpnp/backend/extensions/vm/inv.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "inv.hpp"
diff --git a/dpnp/backend/extensions/vm/ln.cpp b/dpnp/backend/extensions/vm/ln.cpp
index e60a0545005b..a5365e4d5a8b 100644
--- a/dpnp/backend/extensions/vm/ln.cpp
+++ b/dpnp/backend/extensions/vm/ln.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "ln.hpp"
diff --git a/dpnp/backend/extensions/vm/log10.cpp b/dpnp/backend/extensions/vm/log10.cpp
index d26ec57ab9ce..c04fb602f63d 100644
--- a/dpnp/backend/extensions/vm/log10.cpp
+++ b/dpnp/backend/extensions/vm/log10.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "log10.hpp"
diff --git a/dpnp/backend/extensions/vm/log1p.cpp b/dpnp/backend/extensions/vm/log1p.cpp
index 861804f8f6e0..04416bf37185 100644
--- a/dpnp/backend/extensions/vm/log1p.cpp
+++ b/dpnp/backend/extensions/vm/log1p.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "log1p.hpp"
diff --git a/dpnp/backend/extensions/vm/log2.cpp b/dpnp/backend/extensions/vm/log2.cpp
index e75e96c32fe9..752caa261977 100644
--- a/dpnp/backend/extensions/vm/log2.cpp
+++ b/dpnp/backend/extensions/vm/log2.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "log2.hpp"
diff --git a/dpnp/backend/extensions/vm/modf.cpp b/dpnp/backend/extensions/vm/modf.cpp
index ef68c79d8b42..418e4e44f7f7 100644
--- a/dpnp/backend/extensions/vm/modf.cpp
+++ b/dpnp/backend/extensions/vm/modf.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "modf.hpp"
diff --git a/dpnp/backend/extensions/vm/mul.cpp b/dpnp/backend/extensions/vm/mul.cpp
index 0c9cf7fb79cc..557cfb8882b3 100644
--- a/dpnp/backend/extensions/vm/mul.cpp
+++ b/dpnp/backend/extensions/vm/mul.cpp
@@ -36,7 +36,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "mul.hpp"
diff --git a/dpnp/backend/extensions/vm/nextafter.cpp b/dpnp/backend/extensions/vm/nextafter.cpp
index 59b205b3d62a..a8ff710bda77 100644
--- a/dpnp/backend/extensions/vm/nextafter.cpp
+++ b/dpnp/backend/extensions/vm/nextafter.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "nextafter.hpp"
diff --git a/dpnp/backend/extensions/vm/pow.cpp b/dpnp/backend/extensions/vm/pow.cpp
index 5969a4862730..f0db87d1ef48 100644
--- a/dpnp/backend/extensions/vm/pow.cpp
+++ b/dpnp/backend/extensions/vm/pow.cpp
@@ -36,7 +36,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "pow.hpp"
diff --git a/dpnp/backend/extensions/vm/rint.cpp b/dpnp/backend/extensions/vm/rint.cpp
index 41cd20a944a0..86931f259a04 100644
--- a/dpnp/backend/extensions/vm/rint.cpp
+++ b/dpnp/backend/extensions/vm/rint.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "rint.hpp"
diff --git a/dpnp/backend/extensions/vm/sin.cpp b/dpnp/backend/extensions/vm/sin.cpp
index 9263c3c4ffcf..7bb6ec321d2a 100644
--- a/dpnp/backend/extensions/vm/sin.cpp
+++ b/dpnp/backend/extensions/vm/sin.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "sin.hpp"
diff --git a/dpnp/backend/extensions/vm/sinh.cpp b/dpnp/backend/extensions/vm/sinh.cpp
index a1bae13a5281..5c351afd3b82 100644
--- a/dpnp/backend/extensions/vm/sinh.cpp
+++ b/dpnp/backend/extensions/vm/sinh.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "sinh.hpp"
diff --git a/dpnp/backend/extensions/vm/sqr.cpp b/dpnp/backend/extensions/vm/sqr.cpp
index 88c2e833b483..9d5cb8af5f2c 100644
--- a/dpnp/backend/extensions/vm/sqr.cpp
+++ b/dpnp/backend/extensions/vm/sqr.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "sqr.hpp"
diff --git a/dpnp/backend/extensions/vm/sqrt.cpp b/dpnp/backend/extensions/vm/sqrt.cpp
index 98cf2eea9253..5ab3489c1288 100644
--- a/dpnp/backend/extensions/vm/sqrt.cpp
+++ b/dpnp/backend/extensions/vm/sqrt.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "sqrt.hpp"
diff --git a/dpnp/backend/extensions/vm/sub.cpp b/dpnp/backend/extensions/vm/sub.cpp
index 5ee01f239c06..401588d4b65f 100644
--- a/dpnp/backend/extensions/vm/sub.cpp
+++ b/dpnp/backend/extensions/vm/sub.cpp
@@ -36,7 +36,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "sub.hpp"
diff --git a/dpnp/backend/extensions/vm/tan.cpp b/dpnp/backend/extensions/vm/tan.cpp
index 46555ebd0178..590320034934 100644
--- a/dpnp/backend/extensions/vm/tan.cpp
+++ b/dpnp/backend/extensions/vm/tan.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "tan.hpp"
diff --git a/dpnp/backend/extensions/vm/tanh.cpp b/dpnp/backend/extensions/vm/tanh.cpp
index 04d2febfac1d..8febd94f2ec8 100644
--- a/dpnp/backend/extensions/vm/tanh.cpp
+++ b/dpnp/backend/extensions/vm/tanh.cpp
@@ -35,7 +35,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "tanh.hpp"
diff --git a/dpnp/backend/extensions/vm/trunc.cpp b/dpnp/backend/extensions/vm/trunc.cpp
index c23a9a8180fb..4ec788ccf949 100644
--- a/dpnp/backend/extensions/vm/trunc.cpp
+++ b/dpnp/backend/extensions/vm/trunc.cpp
@@ -34,7 +34,10 @@
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dpnp4pybind11.hpp"
 
 #include "common.hpp"
 #include "trunc.hpp"
diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt
index fc446f523e74..6fe04e334f42 100644
--- a/dpnp/backend/extensions/window/CMakeLists.txt
+++ b/dpnp/backend/extensions/window/CMakeLists.txt
@@ -62,15 +62,13 @@ set_target_properties(
 
 target_include_directories(
     ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../
+    PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/../../
+        ${CMAKE_CURRENT_SOURCE_DIR}/../common
+        ${CMAKE_SOURCE_DIR}/dpnp/backend/include
+        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
 )
-target_include_directories(
-    ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
-)
-
 target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR})
-target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
 
 if(WIN32)
     target_compile_options(
diff --git a/dpnp/backend/extensions/window/common.hpp b/dpnp/backend/extensions/window/common.hpp
index cb084e972d78..b95aea6259e3 100644
--- a/dpnp/backend/extensions/window/common.hpp
+++ b/dpnp/backend/extensions/window/common.hpp
@@ -30,9 +30,10 @@
 
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 // dpctl tensor headers
 #include "utils/output_validation.hpp"
@@ -41,7 +42,6 @@
 
 namespace dpnp::extensions::window
 {
-
 namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
 
 namespace py = pybind11;
diff --git a/dpnp/backend/extensions/window/kaiser.hpp b/dpnp/backend/extensions/window/kaiser.hpp
index 0a4712cc594e..46227a60669f 100644
--- a/dpnp/backend/extensions/window/kaiser.hpp
+++ b/dpnp/backend/extensions/window/kaiser.hpp
@@ -28,9 +28,10 @@
 
 #pragma once
 
-#include <dpctl4pybind11.hpp>
 #include <sycl/sycl.hpp>
 
+#include "dpnp4pybind11.hpp"
+
 namespace dpnp::extensions::window
 {
 extern std::pair<sycl::event, sycl::event>
@@ -40,5 +41,4 @@ extern std::pair<sycl::event, sycl::event>
               const std::vector<sycl::event> &depends);
 
 extern void init_kaiser_dispatch_vectors(void);
-
 } // namespace dpnp::extensions::window
diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp
new file mode 100644
index 000000000000..cd287989bef2
--- /dev/null
+++ b/dpnp/backend/include/dpnp4pybind11.hpp
@@ -0,0 +1,1373 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include "dpctl_capi.h"
+
+#include <complex>
+#include <cstddef> // for std::size_t for C++ linkage
+#include <memory>
+#include <stddef.h> // for size_t for C linkage
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include <pybind11/pybind11.h>
+
+#include <sycl/sycl.hpp>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace detail
+{
+// Lookup a type according to its size, and return a value corresponding to the
+// NumPy typenum.
+template <typename Concrete>
+constexpr int platform_typeid_lookup()
+{
+    return -1;
+}
+
+template <typename Concrete, typename T, typename... Ts, typename... Ints>
+constexpr int platform_typeid_lookup(int I, Ints... Is)
+{
+    return sizeof(Concrete) == sizeof(T)
+               ? I
+               : platform_typeid_lookup<Concrete, Ts...>(Is...);
+}
+
+class dpctl_capi
+{
+public:
+    // dpctl type objects
+    PyTypeObject *Py_SyclDeviceType_;
+    PyTypeObject *PySyclDeviceType_;
+    PyTypeObject *Py_SyclContextType_;
+    PyTypeObject *PySyclContextType_;
+    PyTypeObject *Py_SyclEventType_;
+    PyTypeObject *PySyclEventType_;
+    PyTypeObject *Py_SyclQueueType_;
+    PyTypeObject *PySyclQueueType_;
+    PyTypeObject *Py_MemoryType_;
+    PyTypeObject *PyMemoryUSMDeviceType_;
+    PyTypeObject *PyMemoryUSMSharedType_;
+    PyTypeObject *PyMemoryUSMHostType_;
+    PyTypeObject *PyUSMArrayType_;
+    PyTypeObject *PySyclProgramType_;
+    PyTypeObject *PySyclKernelType_;
+
+    DPCTLSyclDeviceRef (*SyclDevice_GetDeviceRef_)(PySyclDeviceObject *);
+    PySyclDeviceObject *(*SyclDevice_Make_)(DPCTLSyclDeviceRef);
+
+    DPCTLSyclContextRef (*SyclContext_GetContextRef_)(PySyclContextObject *);
+    PySyclContextObject *(*SyclContext_Make_)(DPCTLSyclContextRef);
+
+    DPCTLSyclEventRef (*SyclEvent_GetEventRef_)(PySyclEventObject *);
+    PySyclEventObject *(*SyclEvent_Make_)(DPCTLSyclEventRef);
+
+    DPCTLSyclQueueRef (*SyclQueue_GetQueueRef_)(PySyclQueueObject *);
+    PySyclQueueObject *(*SyclQueue_Make_)(DPCTLSyclQueueRef);
+
+    // memory
+    DPCTLSyclUSMRef (*Memory_GetUsmPointer_)(Py_MemoryObject *);
+    void *(*Memory_GetOpaquePointer_)(Py_MemoryObject *);
+    DPCTLSyclContextRef (*Memory_GetContextRef_)(Py_MemoryObject *);
+    DPCTLSyclQueueRef (*Memory_GetQueueRef_)(Py_MemoryObject *);
+    size_t (*Memory_GetNumBytes_)(Py_MemoryObject *);
+    PyObject *(*Memory_Make_)(DPCTLSyclUSMRef,
+                              size_t,
+                              DPCTLSyclQueueRef,
+                              PyObject *);
+
+    // program
+    DPCTLSyclKernelRef (*SyclKernel_GetKernelRef_)(PySyclKernelObject *);
+    PySyclKernelObject *(*SyclKernel_Make_)(DPCTLSyclKernelRef, const char *);
+
+    DPCTLSyclKernelBundleRef (*SyclProgram_GetKernelBundleRef_)(
+        PySyclProgramObject *);
+    PySyclProgramObject *(*SyclProgram_Make_)(DPCTLSyclKernelBundleRef);
+
+    // tensor
+    char *(*UsmNDArray_GetData_)(PyUSMArrayObject *);
+    int (*UsmNDArray_GetNDim_)(PyUSMArrayObject *);
+    py::ssize_t *(*UsmNDArray_GetShape_)(PyUSMArrayObject *);
+    py::ssize_t *(*UsmNDArray_GetStrides_)(PyUSMArrayObject *);
+    int (*UsmNDArray_GetTypenum_)(PyUSMArrayObject *);
+    int (*UsmNDArray_GetElementSize_)(PyUSMArrayObject *);
+    int (*UsmNDArray_GetFlags_)(PyUSMArrayObject *);
+    DPCTLSyclQueueRef (*UsmNDArray_GetQueueRef_)(PyUSMArrayObject *);
+    py::ssize_t (*UsmNDArray_GetOffset_)(PyUSMArrayObject *);
+    PyObject *(*UsmNDArray_GetUSMData_)(PyUSMArrayObject *);
+    void (*UsmNDArray_SetWritableFlag_)(PyUSMArrayObject *, int);
+    PyObject *(*UsmNDArray_MakeSimpleFromMemory_)(int,
+                                                  const py::ssize_t *,
+                                                  int,
+                                                  Py_MemoryObject *,
+                                                  py::ssize_t,
+                                                  char);
+    PyObject *(*UsmNDArray_MakeSimpleFromPtr_)(size_t,
+                                               int,
+                                               DPCTLSyclUSMRef,
+                                               DPCTLSyclQueueRef,
+                                               PyObject *);
+    PyObject *(*UsmNDArray_MakeFromPtr_)(int,
+                                         const py::ssize_t *,
+                                         int,
+                                         const py::ssize_t *,
+                                         DPCTLSyclUSMRef,
+                                         DPCTLSyclQueueRef,
+                                         py::ssize_t,
+                                         PyObject *);
+
+    int USM_ARRAY_C_CONTIGUOUS_;
+    int USM_ARRAY_F_CONTIGUOUS_;
+    int USM_ARRAY_WRITABLE_;
+    int UAR_BOOL_, UAR_BYTE_, UAR_UBYTE_, UAR_SHORT_, UAR_USHORT_, UAR_INT_,
+        UAR_UINT_, UAR_LONG_, UAR_ULONG_, UAR_LONGLONG_, UAR_ULONGLONG_,
+        UAR_FLOAT_, UAR_DOUBLE_, UAR_CFLOAT_, UAR_CDOUBLE_, UAR_TYPE_SENTINEL_,
+        UAR_HALF_;
+    int UAR_INT8_, UAR_UINT8_, UAR_INT16_, UAR_UINT16_, UAR_INT32_, UAR_UINT32_,
+        UAR_INT64_, UAR_UINT64_;
+
+    bool PySyclDevice_Check_(PyObject *obj) const
+    {
+        return PyObject_TypeCheck(obj, PySyclDeviceType_) != 0;
+    }
+    bool PySyclContext_Check_(PyObject *obj) const
+    {
+        return PyObject_TypeCheck(obj, PySyclContextType_) != 0;
+    }
+    bool PySyclEvent_Check_(PyObject *obj) const
+    {
+        return PyObject_TypeCheck(obj, PySyclEventType_) != 0;
+    }
+    bool PySyclQueue_Check_(PyObject *obj) const
+    {
+        return PyObject_TypeCheck(obj, PySyclQueueType_) != 0;
+    }
+    bool PySyclKernel_Check_(PyObject *obj) const
+    {
+        return PyObject_TypeCheck(obj, PySyclKernelType_) != 0;
+    }
+    bool PySyclProgram_Check_(PyObject *obj) const
+    {
+        return PyObject_TypeCheck(obj, PySyclProgramType_) != 0;
+    }
+
+    ~dpctl_capi()
+    {
+        as_usm_memory_.reset();
+        default_usm_ndarray_.reset();
+        default_usm_memory_.reset();
+        default_sycl_queue_.reset();
+    };
+
+    static auto &get()
+    {
+        static dpctl_capi api{};
+        return api;
+    }
+
+    py::object default_sycl_queue_pyobj()
+    {
+        return *default_sycl_queue_;
+    }
+    py::object default_usm_memory_pyobj()
+    {
+        return *default_usm_memory_;
+    }
+    py::object default_usm_ndarray_pyobj()
+    {
+        return *default_usm_ndarray_;
+    }
+    py::object as_usm_memory_pyobj()
+    {
+        return *as_usm_memory_;
+    }
+
+private:
+    struct Deleter
+    {
+        void operator()(py::object *p) const
+        {
+            const bool initialized = Py_IsInitialized();
+#if PY_VERSION_HEX < 0x30d0000
+            const bool finalizing = _Py_IsFinalizing();
+#else
+            const bool finalizing = Py_IsFinalizing();
+#endif
+            const bool guard = initialized && !finalizing;
+
+            if (guard) {
+                delete p;
+            }
+        }
+    };
+
+    std::shared_ptr<py::object> default_sycl_queue_;
+    std::shared_ptr<py::object> default_usm_memory_;
+    std::shared_ptr<py::object> default_usm_ndarray_;
+    std::shared_ptr<py::object> as_usm_memory_;
+
+    dpctl_capi()
+        : Py_SyclDeviceType_(nullptr), PySyclDeviceType_(nullptr),
+          Py_SyclContextType_(nullptr), PySyclContextType_(nullptr),
+          Py_SyclEventType_(nullptr), PySyclEventType_(nullptr),
+          Py_SyclQueueType_(nullptr), PySyclQueueType_(nullptr),
+          Py_MemoryType_(nullptr), PyMemoryUSMDeviceType_(nullptr),
+          PyMemoryUSMSharedType_(nullptr), PyMemoryUSMHostType_(nullptr),
+          PyUSMArrayType_(nullptr), PySyclProgramType_(nullptr),
+          PySyclKernelType_(nullptr), SyclDevice_GetDeviceRef_(nullptr),
+          SyclDevice_Make_(nullptr), SyclContext_GetContextRef_(nullptr),
+          SyclContext_Make_(nullptr), SyclEvent_GetEventRef_(nullptr),
+          SyclEvent_Make_(nullptr), SyclQueue_GetQueueRef_(nullptr),
+          SyclQueue_Make_(nullptr), Memory_GetUsmPointer_(nullptr),
+          Memory_GetOpaquePointer_(nullptr), Memory_GetContextRef_(nullptr),
+          Memory_GetQueueRef_(nullptr), Memory_GetNumBytes_(nullptr),
+          Memory_Make_(nullptr), SyclKernel_GetKernelRef_(nullptr),
+          SyclKernel_Make_(nullptr), SyclProgram_GetKernelBundleRef_(nullptr),
+          SyclProgram_Make_(nullptr), UsmNDArray_GetData_(nullptr),
+          UsmNDArray_GetNDim_(nullptr), UsmNDArray_GetShape_(nullptr),
+          UsmNDArray_GetStrides_(nullptr), UsmNDArray_GetTypenum_(nullptr),
+          UsmNDArray_GetElementSize_(nullptr), UsmNDArray_GetFlags_(nullptr),
+          UsmNDArray_GetQueueRef_(nullptr), UsmNDArray_GetOffset_(nullptr),
+          UsmNDArray_GetUSMData_(nullptr), UsmNDArray_SetWritableFlag_(nullptr),
+          UsmNDArray_MakeSimpleFromMemory_(nullptr),
+          UsmNDArray_MakeSimpleFromPtr_(nullptr),
+          UsmNDArray_MakeFromPtr_(nullptr), USM_ARRAY_C_CONTIGUOUS_(0),
+          USM_ARRAY_F_CONTIGUOUS_(0), USM_ARRAY_WRITABLE_(0), UAR_BOOL_(-1),
+          UAR_BYTE_(-1), UAR_UBYTE_(-1), UAR_SHORT_(-1), UAR_USHORT_(-1),
+          UAR_INT_(-1), UAR_UINT_(-1), UAR_LONG_(-1), UAR_ULONG_(-1),
+          UAR_LONGLONG_(-1), UAR_ULONGLONG_(-1), UAR_FLOAT_(-1),
+          UAR_DOUBLE_(-1), UAR_CFLOAT_(-1), UAR_CDOUBLE_(-1),
+          UAR_TYPE_SENTINEL_(-1), UAR_HALF_(-1), UAR_INT8_(-1), UAR_UINT8_(-1),
+          UAR_INT16_(-1), UAR_UINT16_(-1), UAR_INT32_(-1), UAR_UINT32_(-1),
+          UAR_INT64_(-1), UAR_UINT64_(-1), default_sycl_queue_{},
+          default_usm_memory_{}, default_usm_ndarray_{}, as_usm_memory_{}
+
+    {
+        // Import Cython-generated C-API for dpctl
+        // This imports python modules and initializes
+        // static variables such as function pointers for C-API,
+        // e.g. SyclDevice_GetDeviceRef, etc.
+        // pointers to Python types, i.e. PySyclDeviceType, etc.
+        // and exported constants, i.e. USM_ARRAY_C_CONTIGUOUS, etc.
+        import_dpctl();
+
+        // Python type objects for classes implemented by dpctl
+        this->Py_SyclDeviceType_ = &Py_SyclDeviceType;
+        this->PySyclDeviceType_ = &PySyclDeviceType;
+        this->Py_SyclContextType_ = &Py_SyclContextType;
+        this->PySyclContextType_ = &PySyclContextType;
+        this->Py_SyclEventType_ = &Py_SyclEventType;
+        this->PySyclEventType_ = &PySyclEventType;
+        this->Py_SyclQueueType_ = &Py_SyclQueueType;
+        this->PySyclQueueType_ = &PySyclQueueType;
+        this->Py_MemoryType_ = &Py_MemoryType;
+        this->PyMemoryUSMDeviceType_ = &PyMemoryUSMDeviceType;
+        this->PyMemoryUSMSharedType_ = &PyMemoryUSMSharedType;
+        this->PyMemoryUSMHostType_ = &PyMemoryUSMHostType;
+        this->PyUSMArrayType_ = &PyUSMArrayType;
+        this->PySyclProgramType_ = &PySyclProgramType;
+        this->PySyclKernelType_ = &PySyclKernelType;
+
+        // SyclDevice API
+        this->SyclDevice_GetDeviceRef_ = SyclDevice_GetDeviceRef;
+        this->SyclDevice_Make_ = SyclDevice_Make;
+
+        // SyclContext API
+        this->SyclContext_GetContextRef_ = SyclContext_GetContextRef;
+        this->SyclContext_Make_ = SyclContext_Make;
+
+        // SyclEvent API
+        this->SyclEvent_GetEventRef_ = SyclEvent_GetEventRef;
+        this->SyclEvent_Make_ = SyclEvent_Make;
+
+        // SyclQueue API
+        this->SyclQueue_GetQueueRef_ = SyclQueue_GetQueueRef;
+        this->SyclQueue_Make_ = SyclQueue_Make;
+
+        // dpctl.memory API
+        this->Memory_GetUsmPointer_ = Memory_GetUsmPointer;
+        this->Memory_GetOpaquePointer_ = Memory_GetOpaquePointer;
+        this->Memory_GetContextRef_ = Memory_GetContextRef;
+        this->Memory_GetQueueRef_ = Memory_GetQueueRef;
+        this->Memory_GetNumBytes_ = Memory_GetNumBytes;
+        this->Memory_Make_ = Memory_Make;
+
+        // dpctl.program API
+        this->SyclKernel_GetKernelRef_ = SyclKernel_GetKernelRef;
+        this->SyclKernel_Make_ = SyclKernel_Make;
+        this->SyclProgram_GetKernelBundleRef_ = SyclProgram_GetKernelBundleRef;
+        this->SyclProgram_Make_ = SyclProgram_Make;
+
+        // dpctl.tensor.usm_ndarray API
+        this->UsmNDArray_GetData_ = UsmNDArray_GetData;
+        this->UsmNDArray_GetNDim_ = UsmNDArray_GetNDim;
+        this->UsmNDArray_GetShape_ = UsmNDArray_GetShape;
+        this->UsmNDArray_GetStrides_ = UsmNDArray_GetStrides;
+        this->UsmNDArray_GetTypenum_ = UsmNDArray_GetTypenum;
+        this->UsmNDArray_GetElementSize_ = UsmNDArray_GetElementSize;
+        this->UsmNDArray_GetFlags_ = UsmNDArray_GetFlags;
+        this->UsmNDArray_GetQueueRef_ = UsmNDArray_GetQueueRef;
+        this->UsmNDArray_GetOffset_ = UsmNDArray_GetOffset;
+        this->UsmNDArray_GetUSMData_ = UsmNDArray_GetUSMData;
+        this->UsmNDArray_SetWritableFlag_ = UsmNDArray_SetWritableFlag;
+        this->UsmNDArray_MakeSimpleFromMemory_ =
+            UsmNDArray_MakeSimpleFromMemory;
+        this->UsmNDArray_MakeSimpleFromPtr_ = UsmNDArray_MakeSimpleFromPtr;
+        this->UsmNDArray_MakeFromPtr_ = UsmNDArray_MakeFromPtr;
+
+        // constants
+        this->USM_ARRAY_C_CONTIGUOUS_ = USM_ARRAY_C_CONTIGUOUS;
+        this->USM_ARRAY_F_CONTIGUOUS_ = USM_ARRAY_F_CONTIGUOUS;
+        this->USM_ARRAY_WRITABLE_ = USM_ARRAY_WRITABLE;
+        this->UAR_BOOL_ = UAR_BOOL;
+        this->UAR_BYTE_ = UAR_BYTE;
+        this->UAR_UBYTE_ = UAR_UBYTE;
+        this->UAR_SHORT_ = UAR_SHORT;
+        this->UAR_USHORT_ = UAR_USHORT;
+        this->UAR_INT_ = UAR_INT;
+        this->UAR_UINT_ = UAR_UINT;
+        this->UAR_LONG_ = UAR_LONG;
+        this->UAR_ULONG_ = UAR_ULONG;
+        this->UAR_LONGLONG_ = UAR_LONGLONG;
+        this->UAR_ULONGLONG_ = UAR_ULONGLONG;
+        this->UAR_FLOAT_ = UAR_FLOAT;
+        this->UAR_DOUBLE_ = UAR_DOUBLE;
+        this->UAR_CFLOAT_ = UAR_CFLOAT;
+        this->UAR_CDOUBLE_ = UAR_CDOUBLE;
+        this->UAR_TYPE_SENTINEL_ = UAR_TYPE_SENTINEL;
+        this->UAR_HALF_ = UAR_HALF;
+
+        // deduced disjoint types
+        this->UAR_INT8_ = UAR_BYTE;
+        this->UAR_UINT8_ = UAR_UBYTE;
+        this->UAR_INT16_ = UAR_SHORT;
+        this->UAR_UINT16_ = UAR_USHORT;
+        this->UAR_INT32_ =
+            platform_typeid_lookup<std::int32_t, long, int, short>(
+                UAR_LONG, UAR_INT, UAR_SHORT);
+        this->UAR_UINT32_ =
+            platform_typeid_lookup<std::uint32_t, unsigned long, unsigned int,
+                                   unsigned short>(UAR_ULONG, UAR_UINT,
+                                                   UAR_USHORT);
+        this->UAR_INT64_ =
+            platform_typeid_lookup<std::int64_t, long, long long, int>(
+                UAR_LONG, UAR_LONGLONG, UAR_INT);
+        this->UAR_UINT64_ =
+            platform_typeid_lookup<std::uint64_t, unsigned long,
+                                   unsigned long long, unsigned int>(
+                UAR_ULONG, UAR_ULONGLONG, UAR_UINT);
+
+        // create shared pointers to python objects used in type-casters
+        // for dpctl::memory::usm_memory and dpctl::tensor::usm_ndarray
+        sycl::queue q_{};
+        PySyclQueueObject *py_q_tmp =
+            SyclQueue_Make(reinterpret_cast<DPCTLSyclQueueRef>(&q_));
+        const py::object &py_sycl_queue = py::reinterpret_steal<py::object>(
+            reinterpret_cast<PyObject *>(py_q_tmp));
+
+        default_sycl_queue_ = std::shared_ptr<py::object>(
+            new py::object(py_sycl_queue), Deleter{});
+
+        py::module_ mod_memory = py::module_::import("dpctl.memory");
+        const py::object &py_as_usm_memory = mod_memory.attr("as_usm_memory");
+        as_usm_memory_ = std::shared_ptr<py::object>(
+            new py::object{py_as_usm_memory}, Deleter{});
+
+        auto mem_kl = mod_memory.attr("MemoryUSMHost");
+        const py::object &py_default_usm_memory =
+            mem_kl(1, py::arg("queue") = py_sycl_queue);
+        default_usm_memory_ = std::shared_ptr<py::object>(
+            new py::object{py_default_usm_memory}, Deleter{});
+
+        py::module_ mod_usmarray =
+            py::module_::import("dpctl.tensor._usmarray");
+        auto tensor_kl = mod_usmarray.attr("usm_ndarray");
+
+        const py::object &py_default_usm_ndarray =
+            tensor_kl(py::tuple(), py::arg("dtype") = py::str("u1"),
+                      py::arg("buffer") = py_default_usm_memory);
+
+        default_usm_ndarray_ = std::shared_ptr<py::object>(
+            new py::object{py_default_usm_ndarray}, Deleter{});
+    }
+
+    dpctl_capi(dpctl_capi const &) = default;
+    dpctl_capi &operator=(dpctl_capi const &) = default;
+    dpctl_capi &operator=(dpctl_capi &&) = default;
+
+}; // struct dpctl_capi
+} // namespace detail
+} // namespace dpctl
+
+namespace pybind11::detail
+{
+#define DPCTL_TYPE_CASTER(type, py_name)                                       \
+protected:                                                                     \
+    std::unique_ptr<type> value;                                               \
+                                                                               \
+public:                                                                        \
+    static constexpr auto name = py_name;                                      \
+    template <                                                                 \
+        typename T_,                                                           \
+        ::pybind11::detail::enable_if_t<                                       \
+            std::is_same<type, ::pybind11::detail::remove_cv_t<T_>>::value,    \
+            int> = 0>                                                          \
+    static ::pybind11::handle cast(T_ *src,                                    \
+                                   ::pybind11::return_value_policy policy,     \
+                                   ::pybind11::handle parent)                  \
+    {                                                                          \
+        if (!src)                                                              \
+            return ::pybind11::none().release();                               \
+        if (policy == ::pybind11::return_value_policy::take_ownership) {       \
+            auto h = cast(std::move(*src), policy, parent);                    \
+            delete src;                                                        \
+            return h;                                                          \
+        }                                                                      \
+        return cast(*src, policy, parent);                                     \
+    }                                                                          \
+    operator type *()                                                          \
+    {                                                                          \
+        return value.get();                                                    \
+    } /* NOLINT(bugprone-macro-parentheses) */                                 \
+    operator type &()                                                          \
+    {                                                                          \
+        return *value;                                                         \
+    } /* NOLINT(bugprone-macro-parentheses) */                                 \
+    operator type &&() &&                                                      \
+    {                                                                          \
+        return std::move(*value);                                              \
+    } /* NOLINT(bugprone-macro-parentheses) */                                 \
+    template <typename T_>                                                     \
+    using cast_op_type = ::pybind11::detail::movable_cast_op_type<T_>
+
+/* This type caster associates ``sycl::queue`` C++ class with
+ * :class:`dpctl.SyclQueue` for the purposes of generation of
+ * Python bindings by pybind11.
+ */
+template <>
+struct type_caster<sycl::queue>
+{
+public:
+    bool load(handle src, bool)
+    {
+        PyObject *source = src.ptr();
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        if (api.PySyclQueue_Check_(source)) {
+            DPCTLSyclQueueRef QRef = api.SyclQueue_GetQueueRef_(
+                reinterpret_cast<PySyclQueueObject *>(source));
+            value = std::make_unique<sycl::queue>(
+                *(reinterpret_cast<sycl::queue *>(QRef)));
+            return true;
+        }
+        else {
+            throw py::type_error(
+                "Input is of unexpected type, expected dpctl.SyclQueue");
+        }
+    }
+
+    static handle cast(sycl::queue src, return_value_policy, handle)
+    {
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        auto tmp =
+            api.SyclQueue_Make_(reinterpret_cast<DPCTLSyclQueueRef>(&src));
+        return handle(reinterpret_cast<PyObject *>(tmp));
+    }
+
+    DPCTL_TYPE_CASTER(sycl::queue, _("dpctl.SyclQueue"));
+};
+
+/* This type caster associates ``sycl::device`` C++ class with
+ * :class:`dpctl.SyclDevice` for the purposes of generation of
+ * Python bindings by pybind11.
+ */
+template <>
+struct type_caster<sycl::device>
+{
+public:
+    bool load(handle src, bool)
+    {
+        PyObject *source = src.ptr();
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        if (api.PySyclDevice_Check_(source)) {
+            DPCTLSyclDeviceRef DRef = api.SyclDevice_GetDeviceRef_(
+                reinterpret_cast<PySyclDeviceObject *>(source));
+            value = std::make_unique<sycl::device>(
+                *(reinterpret_cast<sycl::device *>(DRef)));
+            return true;
+        }
+        else {
+            throw py::type_error(
+                "Input is of unexpected type, expected dpctl.SyclDevice");
+        }
+    }
+
+    static handle cast(sycl::device src, return_value_policy, handle)
+    {
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        auto tmp =
+            api.SyclDevice_Make_(reinterpret_cast<DPCTLSyclDeviceRef>(&src));
+        return handle(reinterpret_cast<PyObject *>(tmp));
+    }
+
+    DPCTL_TYPE_CASTER(sycl::device, _("dpctl.SyclDevice"));
+};
+
+/* This type caster associates ``sycl::context`` C++ class with
+ * :class:`dpctl.SyclContext` for the purposes of generation of
+ * Python bindings by pybind11.
+ */
+template <>
+struct type_caster<sycl::context>
+{
+public:
+    bool load(handle src, bool)
+    {
+        PyObject *source = src.ptr();
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        if (api.PySyclContext_Check_(source)) {
+            DPCTLSyclContextRef CRef = api.SyclContext_GetContextRef_(
+                reinterpret_cast<PySyclContextObject *>(source));
+            value = std::make_unique<sycl::context>(
+                *(reinterpret_cast<sycl::context *>(CRef)));
+            return true;
+        }
+        else {
+            throw py::type_error(
+                "Input is of unexpected type, expected dpctl.SyclContext");
+        }
+    }
+
+    static handle cast(sycl::context src, return_value_policy, handle)
+    {
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        auto tmp =
+            api.SyclContext_Make_(reinterpret_cast<DPCTLSyclContextRef>(&src));
+        return handle(reinterpret_cast<PyObject *>(tmp));
+    }
+
+    DPCTL_TYPE_CASTER(sycl::context, _("dpctl.SyclContext"));
+};
+
+/* This type caster associates ``sycl::event`` C++ class with
+ * :class:`dpctl.SyclEvent` for the purposes of generation of
+ * Python bindings by pybind11.
+ */
+template <>
+struct type_caster<sycl::event>
+{
+public:
+    bool load(handle src, bool)
+    {
+        PyObject *source = src.ptr();
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        if (api.PySyclEvent_Check_(source)) {
+            DPCTLSyclEventRef ERef = api.SyclEvent_GetEventRef_(
+                reinterpret_cast<PySyclEventObject *>(source));
+            value = std::make_unique<sycl::event>(
+                *(reinterpret_cast<sycl::event *>(ERef)));
+            return true;
+        }
+        else {
+            throw py::type_error(
+                "Input is of unexpected type, expected dpctl.SyclEvent");
+        }
+    }
+
+    static handle cast(sycl::event src, return_value_policy, handle)
+    {
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        auto tmp =
+            api.SyclEvent_Make_(reinterpret_cast<DPCTLSyclEventRef>(&src));
+        return handle(reinterpret_cast<PyObject *>(tmp));
+    }
+
+    DPCTL_TYPE_CASTER(sycl::event, _("dpctl.SyclEvent"));
+};
+
+/* This type caster associates ``sycl::kernel`` C++ class with
+ * :class:`dpctl.program.SyclKernel` for the purposes of generation of
+ * Python bindings by pybind11.
+ */
+template <>
+struct type_caster<sycl::kernel>
+{
+public:
+    bool load(handle src, bool)
+    {
+        PyObject *source = src.ptr();
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        if (api.PySyclKernel_Check_(source)) {
+            DPCTLSyclKernelRef KRef = api.SyclKernel_GetKernelRef_(
+                reinterpret_cast<PySyclKernelObject *>(source));
+            value = std::make_unique<sycl::kernel>(
+                *(reinterpret_cast<sycl::kernel *>(KRef)));
+            return true;
+        }
+        else {
+            throw py::type_error("Input is of unexpected type, expected "
+                                 "dpctl.program.SyclKernel");
+        }
+    }
+
+    static handle cast(sycl::kernel src, return_value_policy, handle)
+    {
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        auto tmp =
+            api.SyclKernel_Make_(reinterpret_cast<DPCTLSyclKernelRef>(&src),
+                                 "dpctl4pybind11_kernel");
+        return handle(reinterpret_cast<PyObject *>(tmp));
+    }
+
+    DPCTL_TYPE_CASTER(sycl::kernel, _("dpctl.program.SyclKernel"));
+};
+
+/* This type caster associates
+ * ``sycl::kernel_bundle<sycl::bundle_state::executable>`` C++ class with
+ * :class:`dpctl.program.SyclProgram` for the purposes of generation of
+ * Python bindings by pybind11.
+ */
+template <>
+struct type_caster<sycl::kernel_bundle<sycl::bundle_state::executable>>
+{
+public:
+    bool load(handle src, bool)
+    {
+        PyObject *source = src.ptr();
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        if (api.PySyclProgram_Check_(source)) {
+            DPCTLSyclKernelBundleRef KBRef =
+                api.SyclProgram_GetKernelBundleRef_(
+                    reinterpret_cast<PySyclProgramObject *>(source));
+            value = std::make_unique<
+                sycl::kernel_bundle<sycl::bundle_state::executable>>(
+                *(reinterpret_cast<
+                    sycl::kernel_bundle<sycl::bundle_state::executable> *>(
+                    KBRef)));
+            return true;
+        }
+        else {
+            throw py::type_error("Input is of unexpected type, expected "
+                                 "dpctl.program.SyclProgram");
+        }
+    }
+
+    static handle cast(sycl::kernel_bundle<sycl::bundle_state::executable> src,
+                       return_value_policy,
+                       handle)
+    {
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        auto tmp = api.SyclProgram_Make_(
+            reinterpret_cast<DPCTLSyclKernelBundleRef>(&src));
+        return handle(reinterpret_cast<PyObject *>(tmp));
+    }
+
+    DPCTL_TYPE_CASTER(sycl::kernel_bundle<sycl::bundle_state::executable>,
+                      _("dpctl.program.SyclProgram"));
+};
+
+/* This type caster associates
+ * ``sycl::half`` C++ class with Python :class:`float` for the purposes
+ * of generation of Python bindings by pybind11.
+ */
+template <>
+struct type_caster<sycl::half>
+{
+public:
+    bool load(handle src, bool convert)
+    {
+        double py_value;
+
+        if (!src) {
+            return false;
+        }
+
+        PyObject *source = src.ptr();
+
+        if (convert || PyFloat_Check(source)) {
+            py_value = PyFloat_AsDouble(source);
+        }
+        else {
+            return false;
+        }
+
+        bool py_err = (py_value == double(-1)) && PyErr_Occurred();
+
+        if (py_err) {
+            PyErr_Clear();
+            if (convert && (PyNumber_Check(source) != 0)) {
+                auto tmp = reinterpret_steal<object>(PyNumber_Float(source));
+                return load(tmp, false);
+            }
+            return false;
+        }
+        value = static_cast<sycl::half>(py_value);
+        return true;
+    }
+
+    static handle cast(sycl::half src, return_value_policy, handle)
+    {
+        return PyFloat_FromDouble(static_cast<double>(src));
+    }
+
+    PYBIND11_TYPE_CASTER(sycl::half, _("float"));
+};
+} // namespace pybind11::detail
+
+namespace dpctl
+{
+namespace memory
+{
+// since PYBIND11_OBJECT_CVT uses error_already_set without namespace,
+// this allows to avoid compilation error
+using pybind11::error_already_set;
+
+class usm_memory : public py::object
+{
+public:
+    PYBIND11_OBJECT_CVT(
+        usm_memory,
+        py::object,
+        [](PyObject *o) -> bool {
+            return PyObject_TypeCheck(
+                       o, ::dpctl::detail::dpctl_capi::get().Py_MemoryType_) !=
+                   0;
+        },
+        [](PyObject *o) -> PyObject * { return as_usm_memory(o); })
+
+    usm_memory()
+        : py::object(
+              ::dpctl::detail::dpctl_capi::get().default_usm_memory_pyobj(),
+              borrowed_t{})
+    {
+        if (!m_ptr)
+            throw py::error_already_set();
+    }
+
+    /*! @brief Create usm_memory object from shared pointer that manages
+     *  lifetime of the USM allocation.
+     */
+    usm_memory(void *usm_ptr,
+               std::size_t nbytes,
+               const sycl::queue &q,
+               std::shared_ptr<void> shptr)
+    {
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        DPCTLSyclUSMRef usm_ref = reinterpret_cast<DPCTLSyclUSMRef>(usm_ptr);
+        auto q_uptr = std::make_unique<sycl::queue>(q);
+        DPCTLSyclQueueRef QRef =
+            reinterpret_cast<DPCTLSyclQueueRef>(q_uptr.get());
+
+        auto vacuous_destructor = []() {};
+        py::capsule mock_owner(vacuous_destructor);
+
+        // create memory object owned by mock_owner, it is a new reference
+        PyObject *_memory =
+            api.Memory_Make_(usm_ref, nbytes, QRef, mock_owner.ptr());
+        auto ref_count_decrementer = [](PyObject *o) noexcept { Py_DECREF(o); };
+
+        using py_uptrT =
+            std::unique_ptr<PyObject, decltype(ref_count_decrementer)>;
+
+        if (!_memory) {
+            throw py::error_already_set();
+        }
+
+        auto memory_uptr = py_uptrT(_memory, ref_count_decrementer);
+        std::shared_ptr<void> *opaque_ptr = new std::shared_ptr<void>(shptr);
+
+        Py_MemoryObject *memobj = reinterpret_cast<Py_MemoryObject *>(_memory);
+        // replace mock_owner capsule as the owner
+        memobj->refobj = Py_None;
+        // set opaque ptr field, usm_memory now knowns that USM is managed
+        // by smart pointer
+        memobj->_opaque_ptr = reinterpret_cast<void *>(opaque_ptr);
+
+        // _memory will delete created copies of sycl::queue, and
+        // std::shared_ptr and the deleter of the shared_ptr<void> is
+        // supposed to free the USM allocation
+        m_ptr = _memory;
+        q_uptr.release();
+        memory_uptr.release();
+    }
+
+    sycl::queue get_queue() const
+    {
+        Py_MemoryObject *mem_obj = reinterpret_cast<Py_MemoryObject *>(m_ptr);
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        DPCTLSyclQueueRef QRef = api.Memory_GetQueueRef_(mem_obj);
+        sycl::queue *obj_q = reinterpret_cast<sycl::queue *>(QRef);
+        return *obj_q;
+    }
+
+    char *get_pointer() const
+    {
+        Py_MemoryObject *mem_obj = reinterpret_cast<Py_MemoryObject *>(m_ptr);
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        DPCTLSyclUSMRef MRef = api.Memory_GetUsmPointer_(mem_obj);
+        return reinterpret_cast<char *>(MRef);
+    }
+
+    std::size_t get_nbytes() const
+    {
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        Py_MemoryObject *mem_obj = reinterpret_cast<Py_MemoryObject *>(m_ptr);
+        return api.Memory_GetNumBytes_(mem_obj);
+    }
+
+    bool is_managed_by_smart_ptr() const
+    {
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        Py_MemoryObject *mem_obj = reinterpret_cast<Py_MemoryObject *>(m_ptr);
+        const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
+
+        return bool(opaque_ptr);
+    }
+
+    const std::shared_ptr<void> &get_smart_ptr_owner() const
+    {
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        Py_MemoryObject *mem_obj = reinterpret_cast<Py_MemoryObject *>(m_ptr);
+        void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
+
+        if (opaque_ptr) {
+            auto shptr_ptr =
+                reinterpret_cast<std::shared_ptr<void> *>(opaque_ptr);
+            return *shptr_ptr;
+        }
+        else {
+            throw std::runtime_error(
+                "Memory object does not have smart pointer "
+                "managing lifetime of USM allocation");
+        }
+    }
+
+protected:
+    static PyObject *as_usm_memory(PyObject *o)
+    {
+        if (o == nullptr) {
+            PyErr_SetString(PyExc_ValueError,
+                            "cannot create a usm_memory from a nullptr");
+            return nullptr;
+        }
+
+        auto converter =
+            ::dpctl::detail::dpctl_capi::get().as_usm_memory_pyobj();
+
+        py::object res;
+        try {
+            res = converter(py::handle(o));
+        } catch (const py::error_already_set &e) {
+            return nullptr;
+        }
+        return res.ptr();
+    }
+};
+} // end namespace memory
+
+namespace tensor
+{
+inline std::vector<py::ssize_t>
+    c_contiguous_strides(int nd,
+                         const py::ssize_t *shape,
+                         py::ssize_t element_size = 1)
+{
+    if (nd > 0) {
+        std::vector<py::ssize_t> c_strides(nd, element_size);
+        for (int ic = nd - 1; ic > 0;) {
+            py::ssize_t next_v = c_strides[ic] * shape[ic];
+            c_strides[--ic] = next_v;
+        }
+        return c_strides;
+    }
+    else {
+        return std::vector<py::ssize_t>();
+    }
+}
+
+inline std::vector<py::ssize_t>
+    f_contiguous_strides(int nd,
+                         const py::ssize_t *shape,
+                         py::ssize_t element_size = 1)
+{
+    if (nd > 0) {
+        std::vector<py::ssize_t> f_strides(nd, element_size);
+        for (int i = 0; i < nd - 1;) {
+            py::ssize_t next_v = f_strides[i] * shape[i];
+            f_strides[++i] = next_v;
+        }
+        return f_strides;
+    }
+    else {
+        return std::vector<py::ssize_t>();
+    }
+}
+
+inline std::vector<py::ssize_t>
+    c_contiguous_strides(const std::vector<py::ssize_t> &shape,
+                         py::ssize_t element_size = 1)
+{
+    return c_contiguous_strides(shape.size(), shape.data(), element_size);
+}
+
+inline std::vector<py::ssize_t>
+    f_contiguous_strides(const std::vector<py::ssize_t> &shape,
+                         py::ssize_t element_size = 1)
+{
+    return f_contiguous_strides(shape.size(), shape.data(), element_size);
+}
+
+class usm_ndarray : public py::object
+{
+public:
+    PYBIND11_OBJECT(usm_ndarray, py::object, [](PyObject *o) -> bool {
+        return PyObject_TypeCheck(
+                   o, ::dpctl::detail::dpctl_capi::get().PyUSMArrayType_) != 0;
+    })
+
+    usm_ndarray()
+        : py::object(
+              ::dpctl::detail::dpctl_capi::get().default_usm_ndarray_pyobj(),
+              borrowed_t{})
+    {
+        if (!m_ptr)
+            throw py::error_already_set();
+    }
+
+    char *get_data() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        return api.UsmNDArray_GetData_(raw_ar);
+    }
+
+    template <typename T>
+    T *get_data() const
+    {
+        return reinterpret_cast<T *>(get_data());
+    }
+
+    int get_ndim() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        return api.UsmNDArray_GetNDim_(raw_ar);
+    }
+
+    const py::ssize_t *get_shape_raw() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        return api.UsmNDArray_GetShape_(raw_ar);
+    }
+
+    std::vector<py::ssize_t> get_shape_vector() const
+    {
+        auto raw_sh = get_shape_raw();
+        auto nd = get_ndim();
+
+        std::vector<py::ssize_t> shape_vector(raw_sh, raw_sh + nd);
+        return shape_vector;
+    }
+
+    py::ssize_t get_shape(int i) const
+    {
+        auto shape_ptr = get_shape_raw();
+        return shape_ptr[i];
+    }
+
+    const py::ssize_t *get_strides_raw() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        return api.UsmNDArray_GetStrides_(raw_ar);
+    }
+
+    std::vector<py::ssize_t> get_strides_vector() const
+    {
+        auto raw_st = get_strides_raw();
+        auto nd = get_ndim();
+
+        if (raw_st == nullptr) {
+            auto is_c_contig = is_c_contiguous();
+            auto is_f_contig = is_f_contiguous();
+            auto raw_sh = get_shape_raw();
+            if (is_c_contig) {
+                const auto &contig_strides = c_contiguous_strides(nd, raw_sh);
+                return contig_strides;
+            }
+            else if (is_f_contig) {
+                const auto &contig_strides = f_contiguous_strides(nd, raw_sh);
+                return contig_strides;
+            }
+            else {
+                throw std::runtime_error("Invalid array encountered when "
+                                         "building strides");
+            }
+        }
+        else {
+            std::vector<py::ssize_t> st_vec(raw_st, raw_st + nd);
+            return st_vec;
+        }
+    }
+
+    py::ssize_t get_size() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        int ndim = api.UsmNDArray_GetNDim_(raw_ar);
+        const py::ssize_t *shape = api.UsmNDArray_GetShape_(raw_ar);
+
+        py::ssize_t nelems = 1;
+        for (int i = 0; i < ndim; ++i) {
+            nelems *= shape[i];
+        }
+
+        assert(nelems >= 0);
+        return nelems;
+    }
+
+    std::pair<py::ssize_t, py::ssize_t> get_minmax_offsets() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        int nd = api.UsmNDArray_GetNDim_(raw_ar);
+        const py::ssize_t *shape = api.UsmNDArray_GetShape_(raw_ar);
+        const py::ssize_t *strides = api.UsmNDArray_GetStrides_(raw_ar);
+
+        py::ssize_t offset_min = 0;
+        py::ssize_t offset_max = 0;
+        if (strides == nullptr) {
+            py::ssize_t stride(1);
+            for (int i = 0; i < nd; ++i) {
+                offset_max += stride * (shape[i] - 1);
+                stride *= shape[i];
+            }
+        }
+        else {
+            for (int i = 0; i < nd; ++i) {
+                py::ssize_t delta = strides[i] * (shape[i] - 1);
+                if (strides[i] > 0) {
+                    offset_max += delta;
+                }
+                else {
+                    offset_min += delta;
+                }
+            }
+        }
+        return std::make_pair(offset_min, offset_max);
+    }
+
+    sycl::queue get_queue() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        DPCTLSyclQueueRef QRef = api.UsmNDArray_GetQueueRef_(raw_ar);
+        return *(reinterpret_cast<sycl::queue *>(QRef));
+    }
+
+    sycl::device get_device() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        DPCTLSyclQueueRef QRef = api.UsmNDArray_GetQueueRef_(raw_ar);
+        return reinterpret_cast<sycl::queue *>(QRef)->get_device();
+    }
+
+    int get_typenum() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        return api.UsmNDArray_GetTypenum_(raw_ar);
+    }
+
+    int get_flags() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        return api.UsmNDArray_GetFlags_(raw_ar);
+    }
+
+    int get_elemsize() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        return api.UsmNDArray_GetElementSize_(raw_ar);
+    }
+
+    bool is_c_contiguous() const
+    {
+        int flags = get_flags();
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        return static_cast<bool>(flags & api.USM_ARRAY_C_CONTIGUOUS_);
+    }
+
+    bool is_f_contiguous() const
+    {
+        int flags = get_flags();
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        return static_cast<bool>(flags & api.USM_ARRAY_F_CONTIGUOUS_);
+    }
+
+    bool is_writable() const
+    {
+        int flags = get_flags();
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        return static_cast<bool>(flags & api.USM_ARRAY_WRITABLE_);
+    }
+
+    /*! @brief Get usm_data property of array */
+    py::object get_usm_data() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        // UsmNDArray_GetUSMData_ gives a new reference
+        PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);
+
+        // pass reference ownership to py::object
+        return py::reinterpret_steal<py::object>(usm_data);
+    }
+
+    bool is_managed_by_smart_ptr() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);
+
+        if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) {
+            Py_DECREF(usm_data);
+            return false;
+        }
+
+        Py_MemoryObject *mem_obj =
+            reinterpret_cast<Py_MemoryObject *>(usm_data);
+        const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
+
+        Py_DECREF(usm_data);
+        return bool(opaque_ptr);
+    }
+
+    const std::shared_ptr<void> &get_smart_ptr_owner() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+
+        PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);
+
+        if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) {
+            Py_DECREF(usm_data);
+            throw std::runtime_error(
+                "usm_ndarray object does not have Memory object "
+                "managing lifetime of USM allocation");
+        }
+
+        Py_MemoryObject *mem_obj =
+            reinterpret_cast<Py_MemoryObject *>(usm_data);
+        void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
+        Py_DECREF(usm_data);
+
+        if (opaque_ptr) {
+            auto shptr_ptr =
+                reinterpret_cast<std::shared_ptr<void> *>(opaque_ptr);
+            return *shptr_ptr;
+        }
+        else {
+            throw std::runtime_error(
+                "Memory object underlying usm_ndarray does not have "
+                "smart pointer managing lifetime of USM allocation");
+        }
+    }
+
+private:
+    PyUSMArrayObject *usm_array_ptr() const
+    {
+        return reinterpret_cast<PyUSMArrayObject *>(m_ptr);
+    }
+};
+} // end namespace tensor
+
+namespace utils
+{
+namespace detail
+{
+struct ManagedMemory
+{
+
+    static bool is_usm_managed_by_shared_ptr(const py::object &h)
+    {
+        if (py::isinstance<dpctl::memory::usm_memory>(h)) {
+            const auto &usm_memory_inst =
+                py::cast<dpctl::memory::usm_memory>(h);
+            return usm_memory_inst.is_managed_by_smart_ptr();
+        }
+        else if (py::isinstance<dpctl::tensor::usm_ndarray>(h)) {
+            const auto &usm_array_inst =
+                py::cast<dpctl::tensor::usm_ndarray>(h);
+            return usm_array_inst.is_managed_by_smart_ptr();
+        }
+
+        return false;
+    }
+
+    static const std::shared_ptr<void> &extract_shared_ptr(const py::object &h)
+    {
+        if (py::isinstance<dpctl::memory::usm_memory>(h)) {
+            const auto &usm_memory_inst =
+                py::cast<dpctl::memory::usm_memory>(h);
+            return usm_memory_inst.get_smart_ptr_owner();
+        }
+        else if (py::isinstance<dpctl::tensor::usm_ndarray>(h)) {
+            const auto &usm_array_inst =
+                py::cast<dpctl::tensor::usm_ndarray>(h);
+            return usm_array_inst.get_smart_ptr_owner();
+        }
+
+        throw std::runtime_error(
+            "Attempted extraction of shared_ptr on an unrecognized type");
+    }
+};
+} // end of namespace detail
+
+template <std::size_t num>
+sycl::event keep_args_alive(sycl::queue &q,
+                            const py::object (&py_objs)[num],
+                            const std::vector<sycl::event> &depends = {})
+{
+    std::size_t n_objects_held = 0;
+    std::array<std::shared_ptr<py::handle>, num> shp_arr{};
+
+    std::size_t n_usm_owners_held = 0;
+    std::array<std::shared_ptr<void>, num> shp_usm{};
+
+    for (std::size_t i = 0; i < num; ++i) {
+        const auto &py_obj_i = py_objs[i];
+        if (detail::ManagedMemory::is_usm_managed_by_shared_ptr(py_obj_i)) {
+            const auto &shp =
+                detail::ManagedMemory::extract_shared_ptr(py_obj_i);
+            shp_usm[n_usm_owners_held] = shp;
+            ++n_usm_owners_held;
+        }
+        else {
+            shp_arr[n_objects_held] = std::make_shared<py::handle>(py_obj_i);
+            shp_arr[n_objects_held]->inc_ref();
+            ++n_objects_held;
+        }
+    }
+
+    bool use_depends = true;
+    sycl::event host_task_ev;
+
+    if (n_usm_owners_held > 0) {
+        host_task_ev = q.submit([&](sycl::handler &cgh) {
+            if (use_depends) {
+                cgh.depends_on(depends);
+                use_depends = false;
+            }
+            else {
+                cgh.depends_on(host_task_ev);
+            }
+            cgh.host_task([shp_usm = std::move(shp_usm)]() {
+                // no body, but shared pointers are captured in
+                // the lambda, ensuring that USM allocation is
+                // kept alive
+            });
+        });
+    }
+
+    if (n_objects_held > 0) {
+        host_task_ev = q.submit([&](sycl::handler &cgh) {
+            if (use_depends) {
+                cgh.depends_on(depends);
+                use_depends = false;
+            }
+            else {
+                cgh.depends_on(host_task_ev);
+            }
+            cgh.host_task([n_objects_held, shp_arr = std::move(shp_arr)]() {
+                py::gil_scoped_acquire acquire;
+
+                for (std::size_t i = 0; i < n_objects_held; ++i) {
+                    shp_arr[i]->dec_ref();
+                }
+            });
+        });
+    }
+
+    return host_task_ev;
+}
+
+/*! @brief Check if all allocation queues are the same as the
+    execution queue */
+template <std::size_t num>
+bool queues_are_compatible(const sycl::queue &exec_q,
+                           const sycl::queue (&alloc_qs)[num])
+{
+    for (std::size_t i = 0; i < num; ++i) {
+
+        if (exec_q != alloc_qs[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/*! @brief Check if all allocation queues of usm_ndarays are the same as
+    the execution queue */
+template <std::size_t num>
+bool queues_are_compatible(const sycl::queue &exec_q,
+                           const ::dpctl::tensor::usm_ndarray (&arrs)[num])
+{
+    for (std::size_t i = 0; i < num; ++i) {
+
+        if (exec_q != arrs[i].get_queue()) {
+            return false;
+        }
+    }
+    return true;
+}
+} // end namespace utils
+} // end namespace dpctl
diff --git a/pyproject.toml b/pyproject.toml
index d659428877fc..cdf592535d11 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314']
 [tool.codespell]
 builtin = "clear,rare,informal,names"
 check-filenames = true
-ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart"
+ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT"
 quiet-level = 3
 
 [tool.coverage.report]
@@ -134,13 +134,21 @@ source = [
 ensure_newline_before_comments = true
 force_grid_wrap = 0
 include_trailing_comma = true
+known_third_party = ["dpctl"]
 line_length = 80
 multi_line_output = 3
+profile = "black"
 skip = ["dpnp/__init__.py"]
 split_on_trailing_comma = true
 use_parentheses = true
 
 [tool.pylint.basic]
+disable = [
+  "wrong-import-order",
+  "ungrouped-imports",
+  "wrong-import-position"
+]
+ignored-modules = ["dpctl", "dpctl.*"]
 include-naming-hint = true
 
 [tool.pylint.classes]

From 1a1a099b57baf20deb3623dd543f990abd915f94 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 19 Feb 2026 13:19:13 +0100
Subject: [PATCH 02/43] Move `_tensor_impl` extensions and use it for dpnp
 (#2755)

This PR proposes introducing `dpctl_ext` as a new internal extension
module (temporarily renamed from `dpctl` to avoid conflicts), adding
CMake/packaging support for building `_tensor_impl` via pybind11 and
switching dpnp to use `dpctl_ext.tensor. _tensor_impl`

The migrated `_tensor_impl` currently supports the following functions:

>  '_array_overlap',
>  '_as_c_contig',
>  '_as_f_contig',
>  '_contract_iter',
>  '_contract_iter2',
>  '_contract_iter3',
>  '_contract_iter4',
>  '_copy_usm_ndarray_into_usm_ndarray',
>  '_ravel_multi_index',
>  '_same_logical_tensors',
>  '_unravel_index',
>  'default_device_bool_type',
>  'default_device_complex_type',
>  'default_device_fp_type',
>  'default_device_index_type',
>  'default_device_int_type',
>  'default_device_uint_type'

Files in `dpnp` that explicitly `import dpctl.tensor._tensor_impl`
---
 .gitignore                                    |    4 +
 CMakeLists.txt                                |    1 +
 dpctl_ext/CMakeLists.txt                      |  119 ++
 dpctl_ext/__init__.py                         |   27 +
 dpctl_ext/tensor/CMakeLists.txt               |  179 +++
 dpctl_ext/tensor/__init__.py                  |   27 +
 .../libtensor/include/kernels/alignment.hpp   |    0
 .../include/kernels/copy_and_cast.hpp         | 1280 +++++++++++++++++
 .../include/kernels/copy_as_contiguous.hpp    |  646 +++++++++
 .../include/kernels/dpctl_tensor_types.hpp    |    0
 .../kernels/elementwise_functions/common.hpp  |    0
 .../elementwise_functions/common_detail.hpp   |    0
 .../elementwise_functions/logaddexp.hpp       |    0
 .../kernels/elementwise_functions/maximum.hpp |    0
 .../kernels/elementwise_functions/minimum.hpp |    0
 .../elementwise_functions/sycl_complex.hpp    |    0
 .../elementwise_functions/vec_size_util.hpp   |    0
 .../include/utils/indexing_utils.hpp          |    0
 .../libtensor/include/utils/math_utils.hpp    |    0
 .../include/utils/memory_overlap.hpp          |    0
 .../libtensor/include/utils/offset_utils.hpp  |    0
 .../include/utils/output_validation.hpp       |    0
 .../libtensor/include/utils/strided_iters.hpp |    0
 .../include/utils/sycl_alloc_utils.hpp        |    0
 .../libtensor/include/utils/sycl_utils.hpp    |    0
 .../libtensor/include/utils/type_dispatch.hpp |    0
 .../include/utils/type_dispatch_building.hpp  |    0
 .../libtensor/include/utils/type_utils.hpp    |    0
 .../source/copy_and_cast_usm_to_usm.cpp       |  297 ++++
 .../source/copy_and_cast_usm_to_usm.hpp       |   53 +
 .../libtensor/source/copy_as_contig.cpp       |  786 ++++++++++
 .../libtensor/source/copy_as_contig.hpp       |   54 +
 .../source/device_support_queries.cpp         |  179 +++
 .../source/device_support_queries.hpp         |   50 +
 .../source/simplify_iteration_space.cpp       |  540 +++++++
 .../source/simplify_iteration_space.hpp       |  125 ++
 .../tensor/libtensor/source/tensor_ctors.cpp  |  501 +++++++
 dpnp/backend/extensions/blas/CMakeLists.txt   |    2 +-
 dpnp/backend/extensions/fft/CMakeLists.txt    |    2 +-
 .../extensions/indexing/CMakeLists.txt        |    2 +-
 dpnp/backend/extensions/lapack/CMakeLists.txt |    2 +-
 .../extensions/statistics/CMakeLists.txt      |    2 +-
 dpnp/backend/extensions/ufunc/CMakeLists.txt  |    2 +-
 dpnp/backend/extensions/vm/CMakeLists.txt     |    2 +-
 dpnp/backend/extensions/window/CMakeLists.txt |    2 +-
 dpnp/dpnp_algo/dpnp_elementwise_common.py     |    5 +-
 dpnp/dpnp_iface.py                            |    7 +-
 dpnp/dpnp_iface_searching.py                  |    5 +-
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py   |    5 +-
 dpnp/scipy/linalg/_utils.py                   |    4 +-
 dpnp/tests/test_array_api_info.py             |    6 +-
 pyproject.toml                                |    2 +-
 setup.py                                      |    3 +
 53 files changed, 4903 insertions(+), 18 deletions(-)
 create mode 100644 dpctl_ext/CMakeLists.txt
 create mode 100644 dpctl_ext/__init__.py
 create mode 100644 dpctl_ext/tensor/CMakeLists.txt
 create mode 100644 dpctl_ext/tensor/__init__.py
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/alignment.hpp (100%)
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/common.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/indexing_utils.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/math_utils.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/memory_overlap.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/offset_utils.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/output_validation.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/strided_iters.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/sycl_alloc_utils.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/sycl_utils.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_dispatch.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_dispatch_building.hpp (100%)
 rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_utils.hpp (100%)
 create mode 100644 dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/device_support_queries.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/device_support_queries.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp

diff --git a/.gitignore b/.gitignore
index 5d2725d3186f..0cfebe53f623 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,7 @@ dpnp/**/*.cpython*.so
 dpnp/**/*.pyd
 *~
 core
+
+# TODO: revert to `dpctl/`
+# when dpnp fully migrates dpctl/tensor
+dpctl_ext/**/*.cpython*.so
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d0bf1edc75f..c7bb7f650dac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -345,3 +345,4 @@ if(DEFINED SKBUILD)
 endif()
 
 add_subdirectory(dpnp)
+add_subdirectory(dpctl_ext)
diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt
new file mode 100644
index 000000000000..e58693091422
--- /dev/null
+++ b/dpctl_ext/CMakeLists.txt
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# TODO: rework this logic to remove current duplication
+if(WIN32)
+    string(
+        CONCAT WARNING_FLAGS
+        "-Wall "
+        "-Wextra "
+        "-Winit-self "
+        "-Wunused-function "
+        "-Wuninitialized "
+        "-Wmissing-declarations "
+        "-Wstrict-prototypes "
+        "-Wno-unused-parameter "
+    )
+    string(CONCAT SDL_FLAGS "/GS " "/DynamicBase ")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}")
+    set(CMAKE_C_FLAGS_DEBUG
+        "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\""
+    )
+    set(CMAKE_CXX_FLAGS_DEBUG
+        "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\""
+    )
+    set(CMAKE_C_FLAGS_COVERAGE
+        "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG"
+    )
+    set(CMAKE_CXX_FLAGS_COVERAGE
+        "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG"
+    )
+    set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}")
+    set(DPCTL_LDFLAGS "/NXCompat;/DynamicBase")
+    mark_as_advanced(
+        CMAKE_CXX_FLAGS_COVERAGE
+        CMAKE_C_FLAGS_COVERAGE
+        CMAKE_MODULE_LINKER_FLAGS_COVERAGE
+    )
+elseif(UNIX)
+    string(
+        CONCAT WARNING_FLAGS
+        "-Wall "
+        "-Wextra "
+        "-Winit-self "
+        "-Wunused-function "
+        "-Wuninitialized "
+        "-Wmissing-declarations "
+        "-Wstrict-prototypes "
+        "-Wno-unused-parameter "
+        "-fdiagnostics-color=auto "
+    )
+    string(
+        CONCAT SDL_FLAGS
+        "-fstack-protector "
+        "-fstack-protector-all "
+        "-fpic "
+        "-fPIC "
+        "-D_FORTIFY_SOURCE=2 "
+        "-Wformat "
+        "-Wformat-security "
+        #       "-fno-strict-overflow "    # no-strict-overflow is implied by -fwrapv
+        "-fno-delete-null-pointer-checks "
+        "-fwrapv "
+    )
+    string(CONCAT CFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}")
+    string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 ${CXXFLAGS}")
+    set(CMAKE_C_FLAGS_DEBUG
+        "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\""
+    )
+    set(CMAKE_CXX_FLAGS_DEBUG
+        "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\""
+    )
+    set(CMAKE_C_FLAGS_COVERAGE "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O1 -g1 -DDEBUG")
+    set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O1 -g1 -DDEBUG")
+    set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}")
+    set(DPCTL_LDFLAGS "-z,noexecstack,-z,relro,-z,now")
+    mark_as_advanced(
+        CMAKE_CXX_FLAGS_COVERAGE
+        CMAKE_C_FLAGS_COVERAGE
+        CMAKE_MODULE_LINKER_FLAGS_COVERAGE
+    )
+else()
+    message(FATAL_ERROR "Unsupported system.")
+endif()
+
+# at build time create include/ directory and copy header files over
+# set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+set(CMAKE_INSTALL_RPATH "$ORIGIN")
+
+add_subdirectory(tensor)
diff --git a/dpctl_ext/__init__.py b/dpctl_ext/__init__.py
new file mode 100644
index 000000000000..a71324cb88d8
--- /dev/null
+++ b/dpctl_ext/__init__.py
@@ -0,0 +1,27 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
new file mode 100644
index 000000000000..ed69b4f10cba
--- /dev/null
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+find_package(Python COMPONENTS Development.Module)
+
+if(WIN32)
+    if(${CMAKE_VERSION} VERSION_LESS "3.27")
+        # this is a work-around for target_link_options inserting option after -link option, cause
+        # linker to ignore it.
+        set(CMAKE_CXX_LINK_FLAGS
+            "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel"
+        )
+    endif()
+endif()
+
+# TODO: reuse this library for dpnp ufunc extension build
+set(_static_lib_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp
+)
+set(_tensor_impl_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
+)
+
+set(_static_lib_trgt simplify_iteration_space)
+
+add_library(${_static_lib_trgt} STATIC ${_static_lib_sources})
+target_include_directories(
+    ${_static_lib_trgt}
+    PRIVATE
+        # ${Python_INCLUDE_DIRS}
+        # ${Dpctl_INCLUDE_DIR}
+        ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include
+)
+target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Module)
+set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+set(_py_trgts)
+
+set(python_module_name _tensor_impl)
+pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources})
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources})
+target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
+list(APPEND _py_trgts ${python_module_name})
+
+set(_clang_prefix "")
+if(WIN32)
+    set(_clang_prefix "/clang:")
+endif()
+
+set(_no_fast_math_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
+)
+#list(
+#APPEND _no_fast_math_sources
+# ${_elementwise_sources}
+# ${_reduction_sources}
+# ${_sorting_sources}
+# ${_linalg_sources}
+# ${_accumulator_sources}
+#)
+
+foreach(_src_fn ${_no_fast_math_sources})
+    get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS)
+    set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math")
+    set_source_files_properties(
+        ${_src_fn}
+        PROPERTIES COMPILE_OPTIONS "${_combined_options_prop}"
+    )
+endforeach()
+
+set(_compiler_definitions "")
+
+set(_linker_options "LINKER:${DPNP_LDFLAGS}")
+foreach(python_module_name ${_py_trgts})
+    target_compile_options(
+        ${python_module_name}
+        PRIVATE -fno-sycl-id-queries-fit-in-int
+    )
+    target_link_options(
+        ${python_module_name}
+        PRIVATE -fsycl-device-code-split=per_kernel
+    )
+    # TODO: expand DPCTL_OFFLOAD_COMPRESS to the whole dpnp level
+    if(DPCTL_OFFLOAD_COMPRESS)
+        target_link_options(${python_module_name} PRIVATE --offload-compress)
+    endif()
+
+    target_include_directories(
+        ${python_module_name}
+        PRIVATE
+            ${CMAKE_SOURCE_DIR}/dpnp/backend/include
+            ${Dpctl_INCLUDE_DIR}
+            ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include
+            ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/
+    )
+    target_link_options(${python_module_name} PRIVATE ${_linker_options})
+    if(DPCTL_GENERATE_COVERAGE)
+        if(DPCTL_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS)
+            target_compile_options(
+                ${python_module_name}
+                PRIVATE -fprofile-instr-generate -fcoverage-mapping
+            )
+        endif()
+        target_link_options(
+            ${python_module_name}
+            PRIVATE -fprofile-instr-generate -fcoverage-mapping
+        )
+    endif()
+    if(_dpnp_sycl_targets)
+        # make fat binary
+        target_compile_options(
+            ${python_module_name}
+            PRIVATE ${_dpnp_sycl_target_compile_options}
+        )
+        target_link_options(
+            ${python_module_name}
+            PRIVATE ${_dpnp_sycl_target_link_options}
+        )
+    endif()
+    # TODO: update source so they reference individual libraries instead of
+    #   dpctl4pybind11.hpp. It will allow to simplify dependency tree
+    # NOTE: dpctl C-API is resolved at runtime via Python
+    # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI)
+    if(DPNP_WITH_REDIST)
+        set_target_properties(
+            ${python_module_name}
+            PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.."
+        )
+    endif()
+    # TODO: revert to `DESTINATION "dpctl/tensor"`
+    install(TARGETS ${python_module_name} DESTINATION "dpctl_ext/tensor")
+endforeach()
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
new file mode 100644
index 000000000000..a71324cb88d8
--- /dev/null
+++ b/dpctl_ext/tensor/__init__.py
@@ -0,0 +1,27 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/kernels/alignment.hpp
rename to dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp
new file mode 100644
index 000000000000..d6001a11e471
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -0,0 +1,1280 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor copying and value casting.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "dpctl_tensor_types.hpp"
+#include "kernels/alignment.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::copy_and_cast
+{
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::offset_utils;
+
+using dpctl::tensor::kernels::alignment_utils::
+    disabled_sg_loadstore_wrapper_krn;
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
+template <typename srcT, typename dstT, typename IndexerT>
+class copy_cast_generic_kernel;
+
+template <typename srcT,
+          typename dstT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class copy_cast_contig_kernel;
+
+template <typename srcT, typename dstT, typename IndexerT>
+class copy_cast_from_host_kernel;
+
+template <typename srcT, typename dstT, typename IndexerT>
+class copy_cast_from_host_contig_kernel;
+
+template <typename srcTy, typename dstTy>
+class Caster
+{
+public:
+    Caster() = default;
+    dstTy operator()(const srcTy &src) const
+    {
+        using dpctl::tensor::type_utils::convert_impl;
+        return convert_impl<dstTy, srcTy>(src);
+    }
+};
+
+template <typename srcT, typename dstT, typename CastFnT, typename IndexerT>
+class GenericCopyFunctor
+{
+private:
+    const srcT *src_ = nullptr;
+    dstT *dst_ = nullptr;
+    IndexerT indexer_;
+
+public:
+    GenericCopyFunctor(const srcT *src_p, dstT *dst_p, const IndexerT &indexer)
+        : src_(src_p), dst_(dst_p), indexer_(indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        const auto &offsets = indexer_(static_cast<ssize_t>(wiid.get(0)));
+        const ssize_t &src_offset = offsets.get_first_offset();
+        const ssize_t &dst_offset = offsets.get_second_offset();
+
+        static constexpr CastFnT fn{};
+        dst_[dst_offset] = fn(src_[src_offset]);
+    }
+};
+
+/*!
+  @defgroup CopyAndCastKernels
+ */
+
+/*!
+ * @brief Function pointer type for generic array cast and copying function.
+ */
+typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const ssize_t *,
+    const char *,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to
+ `dst` usm_ndarray while casting from `srcTy` to `dstTy`.
+
+   Both arrays have array dimensionality specified via argument `nd`. The
+ `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the
+ first `nd` elements encode common shape, second `nd` elements contain strides
+ of `src` array, and the trailing `nd` elements contain strides of `dst` array.
+ `src_p` and `dst_p` represent pointers into respective arrays, but the start of
+ iteration begins at offset of `src_offset` elements for `src` array and at
+ offset `dst_offset` elements for `dst` array. Kernel is submitted to sycl queue
+ `q` with events `depends` and `additional_depends` as dependencies.
+
+   @param  q       Sycl queue to which the kernel is submitted.
+   @param  nelems  Number of elements to cast and copy.
+   @param  nd      Array dimensionality, i.e. number of indices needed to
+ identify an element of each array.
+   @param  shape_and_strides  Kernel accessible USM pointer to packed shape and
+ strides.
+   @param  src_p   Kernel accessible USM pointer for the source array
+   @param  src_offset  Offset to the beginning of iteration in number of
+ elements of source array from `src_p`.
+   @param  dst_p   Kernel accessible USM pointer for the destination array
+   @param  dst_offset  Offset to the beginning of iteration in number of
+ elements of destination array from `dst_p`.
+   @param  depends  List of events to wait for before starting computations, if
+ any.
+   @param  additional_depends Additional list of events to wait for before
+ starting computations, if any.
+
+   @return  Event to wait on to ensure that computation completes.
+   @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy>
+sycl::event copy_and_cast_generic_impl(
+    sycl::queue &q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *src_p,
+    ssize_t src_offset,
+    char *dst_p,
+    ssize_t dst_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
+
+    sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.depends_on(additional_depends);
+
+        const TwoOffsets_StridedIndexer indexer{nd, src_offset, dst_offset,
+                                                shape_and_strides};
+        const srcTy *src_tp = reinterpret_cast<const srcTy *>(src_p);
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
+
+        cgh.parallel_for<class copy_cast_generic_kernel<
+            srcTy, dstTy, TwoOffsets_StridedIndexer>>(
+            sycl::range<1>(nelems),
+            GenericCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>,
+                               TwoOffsets_StridedIndexer>(src_tp, dst_tp,
+                                                          indexer));
+    });
+
+    return copy_and_cast_ev;
+}
+
+/*!
+ * @brief Factory to get generic function pointer of type `fnT` for given source
+ * data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCastGenericFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_generic_impl<D, S>;
+        return f;
+    }
+};
+
+// Specialization of copy_and_cast for contiguous arrays
+
+template <typename srcT,
+          typename dstT,
+          typename CastFnT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+class ContigCopyFunctor
+{
+private:
+    std::size_t nelems;
+    const srcT *src_p = nullptr;
+    dstT *dst_p = nullptr;
+
+public:
+    ContigCopyFunctor(const std::size_t nelems_,
+                      const srcT *src_p_,
+                      dstT *dst_p_)
+        : nelems(nelems_), src_p(src_p_), dst_p(dst_p_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        static constexpr CastFnT fn{};
+
+        static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
+
+        using dpctl::tensor::type_utils::is_complex_v;
+        if constexpr (!enable_sg_loadstore || is_complex_v<srcT> ||
+                      is_complex_v<dstT>) {
+            std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            const std::size_t gid = ndit.get_global_linear_id();
+
+            // start = (gid / sgSize) * elems_per_sg + (gid % sgSize)
+            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;
+            const std::size_t start =
+                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
+            const std::size_t end = std::min(nelems, start + elems_per_sg);
+            for (std::size_t offset = start; offset < end; offset += sgSize) {
+                dst_p[offset] = fn(src_p[offset]);
+            }
+        }
+        else {
+            auto sg = ndit.get_sub_group();
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+            const std::size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+
+            if (base + elems_per_wi * sgSize < nelems) {
+                sycl::vec<dstT, vec_sz> dst_vec;
+
+#pragma unroll
+                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                    const std::size_t offset = base + it * sgSize;
+                    auto src_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&src_p[offset]);
+                    auto dst_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&dst_p[offset]);
+
+                    const sycl::vec<srcT, vec_sz> src_vec =
+                        sub_group_load<vec_sz>(sg, src_multi_ptr);
+#pragma unroll
+                    for (std::uint8_t k = 0; k < vec_sz; k++) {
+                        dst_vec[k] = fn(src_vec[k]);
+                    }
+                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t start = base + sg.get_local_id()[0];
+                for (std::size_t k = start; k < nelems; k += sgSize) {
+                    dst_p[k] = fn(src_p[k]);
+                }
+            }
+        }
+    }
+};
+
+/*!
+ * @brief Function pointer type for contiguous array cast and copy function.
+ */
+typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy `nelems` elements from contiguous `src` usm_ndarray
+ to contiguous `dst` usm_ndarray while casting from `srcTy` to `dstTy`.
+
+   Both arrays have the same number of elements `nelems`.
+ `src_cp` and `dst_cp` represent char pointers to the start of respective
+ arrays. Kernel is submitted to sycl queue `q` with events `depends` as
+ dependencies.
+
+   @param  q       Sycl queue to which the kernel is submitted.
+   @param  nelems  Number of elements to cast and copy.
+   @param  src_p   Kernel accessible USM pointer for the source array
+   @param  dst_p   Kernel accessible USM pointer for the destination array
+   @param  depends  List of events to wait for before starting computations, if
+ any.
+
+   @return  Event to wait on to ensure that computation completes.
+   @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy>
+sycl::event copy_and_cast_contig_impl(sycl::queue &q,
+                                      std::size_t nelems,
+                                      const char *src_cp,
+                                      char *dst_cp,
+                                      const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
+
+    sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const srcTy *src_tp = reinterpret_cast<const srcTy *>(src_cp);
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_cp);
+
+        std::size_t lws = 64;
+        static constexpr std::uint32_t vec_sz = 4;
+        static constexpr std::uint32_t n_vecs = 2;
+        const std::size_t n_groups =
+            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+        const auto gws_range = sycl::range<1>(n_groups * lws);
+        const auto lws_range = sycl::range<1>(lws);
+
+        if (is_aligned<required_alignment>(src_cp) &&
+            is_aligned<required_alignment>(dst_cp))
+        {
+            static constexpr bool enable_sg_loadstore = true;
+            using KernelName =
+                copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
+                                  n_vecs, enable_sg_loadstore>(nelems, src_tp,
+                                                               dst_tp));
+        }
+        else {
+            static constexpr bool disable_sg_loadstore = false;
+            using InnerKernelName =
+                copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
+            using KernelName =
+                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
+                                  n_vecs, disable_sg_loadstore>(nelems, src_tp,
+                                                                dst_tp));
+        }
+    });
+
+    return copy_and_cast_ev;
+}
+
+/*!
+ * @brief Factory to get specialized function pointer for casting and copying
+ * contiguous arrays.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCastContigFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_contig_impl<D, S>;
+        return f;
+    }
+};
+
+// Specialization of copy_and_cast for 1D arrays
+
+/*!
+ * @brief Factory to get function pointer for casting and copying 1D arrays.
+ * @ingroup CopyAndCastKernels
+ */
+typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const std::array<ssize_t, 1> &,
+    const std::array<ssize_t, 1> &,
+    const std::array<ssize_t, 1> &,
+    const char *,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Factory to get function pointer for casting and copying 2D arrays.
+ * @ingroup CopyAndCastKernels
+ */
+typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const std::array<ssize_t, 2> &,
+    const std::array<ssize_t, 2> &,
+    const std::array<ssize_t, 2> &,
+    const char *,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Specialized for given array dimension function to copy `nelems`
+ elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy`
+ to `dstTy`.
+
+   Both arrays have array dimensionality known at compile time and specified in
+ template parameters `nd`. Arrays' shape and strides are provided as
+ `std::array`. `src_p` and `dst_p` represent pointers into respective arrays,
+ but the start of iteration begins at offset of `src_offset` elements for `src`
+ array and at offset `dst_offset` elements for `dst` array. Kernel is submitted
+ to sycl queue `q` with events `depends` as dependencies.
+
+   @param q  The queue where the routine should be executed.
+   @param nelems  Number of elements to cast and copy.
+   @param shape   Common shape of the arrays.
+   @param src_strides Strides of the source array.
+   @param dst_strides Strides of the destination array.
+   @param src_p  Kernel accessible USM pointer for the source array
+   @param src_offset  Offset to the beginning of iteration in number of elements
+ of the source array from `src_p`.
+   @param dst_p  Kernel accessible USM pointer for the destination array
+   @param dst_offset  Offset to the beginning of iteration in number of elements
+ of the destination array from `src_p`.
+   @param depends  List of events to wait for before starting computations, if
+ any.
+
+   @return  Event to wait on to ensure that computation completes.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy, int nd>
+sycl::event copy_and_cast_nd_specialized_impl(
+    sycl::queue &q,
+    std::size_t nelems,
+    const std::array<ssize_t, nd> &shape,
+    const std::array<ssize_t, nd> &src_strides,
+    const std::array<ssize_t, nd> &dst_strides,
+    const char *src_p,
+    ssize_t src_offset,
+    char *dst_p,
+    ssize_t dst_offset,
+    const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
+
+    sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) {
+        using IndexerT = TwoOffsets_FixedDimStridedIndexer<nd>;
+        const IndexerT indexer{shape, src_strides, dst_strides, src_offset,
+                               dst_offset};
+        const srcTy *src_tp = reinterpret_cast<const srcTy *>(src_p);
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
+
+        cgh.depends_on(depends);
+        cgh.parallel_for<
+            class copy_cast_generic_kernel<srcTy, dstTy, IndexerT>>(
+            sycl::range<1>(nelems),
+            GenericCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, IndexerT>(
+                src_tp, dst_tp, indexer));
+    });
+
+    return copy_and_cast_ev;
+}
+
+/*!
+ * @brief Factory to get 1D-specialized function pointer of type `fnT` for given
+ * source data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCast1DFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_nd_specialized_impl<D, S, 1>;
+        return f;
+    }
+};
+
+/*!
+ * @brief Factory to get 2D-specialized function pointer of type `fnT` for given
+ * source data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCast2DFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_nd_specialized_impl<D, S, 2>;
+        return f;
+    }
+};
+
+// ====================== Copying from host to USM
+
+template <typename AccessorT,
+          typename dstTy,
+          typename CastFnT,
+          typename IndexerT>
+class GenericCopyFromHostFunctor
+{
+private:
+    AccessorT src_acc_;
+    dstTy *dst_ = nullptr;
+    IndexerT indexer_;
+
+public:
+    GenericCopyFromHostFunctor(const AccessorT &src_acc,
+                               dstTy *dst_p,
+                               const IndexerT &indexer)
+        : src_acc_(src_acc), dst_(dst_p), indexer_(indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        const auto &offsets = indexer_(static_cast<ssize_t>(wiid.get(0)));
+        const ssize_t &src_offset = offsets.get_first_offset();
+        const ssize_t &dst_offset = offsets.get_second_offset();
+
+        CastFnT fn{};
+        dst_[dst_offset] = fn(src_acc_[src_offset]);
+    }
+};
+
+typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const ssize_t *,
+    const char *,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy from NumPy's ndarray with elements of type `srcTy`
+ * into usm_ndarray with elements of type `srcTy`.
+ *
+ * Function to cast and copy elements from numpy.ndarray specified by typeless
+ * `host_src_p` and the `src_offset` given in the number of array elements.
+ * Arrays' metadata are given in packed USM vector of length `3*nd` whose first
+ * `nd` elements contain arrays' shape, next `nd` elements specify source
+ * strides in elements (not bytes), and trailing `nd` elements specify
+ * destination array strides. Kernel dependencies are given by two vectors of
+ * events: `depends` and `additional_depends`. The function execution is
+ * complete at the return.
+ *
+ * @param q  The queue where the routine should be executed.
+ * @param nelems Number of elements to cast and copy.
+ * @param nd The dimensionality of arrays
+ * @param shape_and_strides  Kernel accessible USM pointer to packed shape and
+ * strides.
+ * @param host_src_p  Host (not USM allocated) pointer associated with the
+ * source array.
+ * @param src_offset  Offset to the beginning of iteration in number of elements
+ * of the source array from `host_src_p`.
+ * @param src_min_nelem_offset  Smallest value of offset relative to
+ * `host_src_p` in number of elements attained while iterating over elements of
+ * the source array.
+ * @param src_max_nelem_offset  Largest value of offset relative to `host_src_p`
+ * in number of elements attained while iterating over elements of the source
+ * array.
+ * @param dst_p  USM pointer associated with the destination array.
+ * @param dst_offset  Offset to the beginning of iteration in number of elements
+ * of the destination array from `dst_p`.
+ * @param depends  List of events to wait for before starting computations, if
+ * any.
+ * @param additional_depends List of additional events to wait for before
+ * starting computations, if any.
+ *
+ * @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy>
+void copy_and_cast_from_host_impl(
+    sycl::queue &q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *host_src_p,
+    ssize_t src_offset,
+    ssize_t src_min_nelem_offset,
+    ssize_t src_max_nelem_offset,
+    char *dst_p,
+    ssize_t dst_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1;
+
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
+
+    sycl::buffer<srcTy, 1> npy_buf(
+        reinterpret_cast<const srcTy *>(host_src_p) + src_min_nelem_offset,
+        sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}});
+
+    sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.depends_on(additional_depends);
+
+        sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only);
+
+        const TwoOffsets_StridedIndexer indexer{
+            nd, src_offset - src_min_nelem_offset, dst_offset,
+            const_cast<const ssize_t *>(shape_and_strides)};
+
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
+
+        cgh.parallel_for<copy_cast_from_host_kernel<srcTy, dstTy,
+                                                    TwoOffsets_StridedIndexer>>(
+            sycl::range<1>(nelems),
+            GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
+                                       Caster<srcTy, dstTy>,
+                                       TwoOffsets_StridedIndexer>(
+                npy_acc, dst_tp, indexer));
+    });
+
+    // perform explicit synchronization. Implicit synchronization would be
+    // performed by sycl::buffer destructor.
+    copy_and_cast_from_host_ev.wait();
+
+    return;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given NumPy array
+ * source data type `S` and destination data type `D`.
+ * @defgroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCastFromHostFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_from_host_impl<D, S>;
+        return f;
+    }
+};
+
+typedef void (*copy_and_cast_from_host_contig_blocking_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,  /* nelems */
+    const char *, /* src_pointer */
+    ssize_t,      /* src_offset */
+    char *,       /* dst_pointer */
+    ssize_t,      /* dst_offset */
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy from NumPy's ndarray with elements of type `srcTy`
+ * into usm_ndarray with elements of type `srcTy` for contiguous arrays.
+ *
+ * Function to cast and copy elements from numpy.ndarray specified by typeless
+ * `host_src_p` and the `src_offset` given in the number of array elements.
+ * Kernel dependencies are given by two vectors of
+ * events: `depends` and `additional_depends`. The function execution is
+ * complete at the return.
+ *
+ * @param q  The queue where the routine should be executed.
+ * @param nelems Number of elements to cast and copy.
+ * @param src_stride The stride of source array in elements
+ * @param dst_stride The stride of destimation array in elements
+ * @param host_src_p  Host (not USM allocated) pointer associated with the
+ * source array.
+ * @param src_offset  Offset to the beginning of iteration in number of elements
+ * of the source array from `host_src_p`.
+ * @param dst_p  USM pointer associated with the destination array.
+ * @param dst_offset  Offset to the beginning of iteration in number of elements
+ * of the destination array from `dst_p`.
+ * @param depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy>
+void copy_and_cast_from_host_contig_impl(
+    sycl::queue &q,
+    std::size_t nelems,
+    const char *host_src_p,
+    ssize_t src_offset,
+    char *dst_p,
+    ssize_t dst_offset,
+    const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
+
+    sycl::buffer<srcTy, 1> npy_buf(
+        reinterpret_cast<const srcTy *>(host_src_p) + src_offset,
+        sycl::range<1>(nelems), {sycl::property::buffer::use_host_ptr{}});
+
+    sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only);
+
+        using IndexerT = TwoOffsets_CombinedIndexer<NoOpIndexer, NoOpIndexer>;
+        static constexpr NoOpIndexer src_indexer{};
+        static constexpr NoOpIndexer dst_indexer{};
+        static constexpr TwoOffsets_CombinedIndexer indexer{src_indexer,
+                                                            dst_indexer};
+
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p) + dst_offset;
+
+        cgh.parallel_for<
+            copy_cast_from_host_contig_kernel<srcTy, dstTy, IndexerT>>(
+            sycl::range<1>(nelems),
+            GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
+                                       Caster<srcTy, dstTy>, IndexerT>(
+                npy_acc, dst_tp, indexer));
+    });
+
+    // perform explicit synchronization. Implicit synchronization would be
+    // performed by sycl::buffer destructor.
+    copy_and_cast_from_host_ev.wait();
+
+    return;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given NumPy array
+ * source data type `S` and destination data type `D`.
+ * @defgroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCastFromHostContigFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_from_host_contig_impl<D, S>;
+        return f;
+    }
+};
+
+// =============== Copying for reshape ================== //
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class copy_for_reshape_generic_kernel;
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class GenericCopyForReshapeFunctor
+{
+private:
+    const Ty *src_p = nullptr;
+    Ty *dst_p = nullptr;
+    SrcIndexerT src_indexer_;
+    DstIndexerT dst_indexer_;
+
+public:
+    GenericCopyForReshapeFunctor(const char *src_ptr,
+                                 char *dst_ptr,
+                                 const SrcIndexerT &src_indexer,
+                                 const DstIndexerT &dst_indexer)
+        : src_p(reinterpret_cast<const Ty *>(src_ptr)),
+          dst_p(reinterpret_cast<Ty *>(dst_ptr)), src_indexer_(src_indexer),
+          dst_indexer_(dst_indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        const ssize_t src_offset = src_indexer_(wiid.get(0));
+        const ssize_t dst_offset = dst_indexer_(wiid.get(0));
+
+        dst_p[dst_offset] = src_p[src_offset];
+    }
+};
+
+// define function type
+typedef sycl::event (*copy_for_reshape_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,     // num_elements
+    int,             // src_nd
+    int,             // dst_nd
+    const ssize_t *, // packed shapes and strides
+    const char *,    // src_data_ptr
+    char *,          // dst_data_ptr
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy content of array while reshaping.
+ *
+ * Submits a kernel to perform a copy `dst[unravel_index(i,
+ * dst.shape)] = src[unravel_undex(i, src.shape)]`.
+ *
+ * @param  q      The execution queue where kernel is submitted.
+ * @param  nelems The number of elements to copy
+ * @param  src_nd Array dimension of the source array
+ * @param  dst_nd Array dimension of the destination array
+ * @param  packed_shapes_and_strides Kernel accessible USM array of size
+ * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape,
+ * dst_strides]`.
+ * @param  src_p  Typeless USM pointer to the buffer of the source array
+ * @param  dst_p  Typeless USM pointer to the buffer of the destination array
+ * @param  depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename Ty>
+sycl::event
+    copy_for_reshape_generic_impl(sycl::queue &q,
+                                  std::size_t nelems,
+                                  int src_nd,
+                                  int dst_nd,
+                                  const ssize_t *packed_shapes_and_strides,
+                                  const char *src_p,
+                                  char *dst_p,
+                                  const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
+
+    sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        // packed_shapes_and_strides:
+        //   USM array of size 2*(src_nd + dst_nd)
+        //   [ src_shape; src_strides; dst_shape; dst_strides ]
+
+        const ssize_t *src_shape_and_strides =
+            const_cast<const ssize_t *>(packed_shapes_and_strides);
+
+        const ssize_t *dst_shape_and_strides = const_cast<const ssize_t *>(
+            packed_shapes_and_strides + (2 * src_nd));
+
+        const StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides};
+        const StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides};
+
+        using KernelName =
+            copy_for_reshape_generic_kernel<Ty, StridedIndexer, StridedIndexer>;
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems),
+            GenericCopyForReshapeFunctor<Ty, StridedIndexer, StridedIndexer>(
+                src_p, dst_p, src_indexer, dst_indexer));
+    });
+
+    return copy_for_reshape_ev;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given array data
+ * type `Ty`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename Ty>
+struct CopyForReshapeGenericFactory
+{
+    fnT get()
+    {
+        fnT f = copy_for_reshape_generic_impl<Ty>;
+        return f;
+    }
+};
+
+// ================== Copying for roll ================== //
+
+/*! @brief Functor to cyclically roll global_id to the left */
+struct LeftRolled1DTransformer
+{
+    LeftRolled1DTransformer(std::size_t offset, std::size_t size)
+        : offset_(offset), size_(size)
+    {
+    }
+
+    std::size_t operator()(std::size_t gid) const
+    {
+        const std::size_t shifted_gid =
+            ((gid < offset_) ? gid + size_ - offset_ : gid - offset_);
+        return shifted_gid;
+    }
+
+private:
+    std::size_t offset_ = 0;
+    std::size_t size_ = 1;
+};
+
+/*! @brief Indexer functor to compose indexer and transformer */
+template <typename IndexerT, typename TransformerT>
+struct CompositionIndexer
+{
+    CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {}
+
+    auto operator()(std::size_t gid) const
+    {
+        return f_(t_(gid));
+    }
+
+private:
+    IndexerT f_;
+    TransformerT t_;
+};
+
+/*! @brief Indexer functor to find offset for nd-shifted indices lifted from
+ * iteration id */
+struct RolledNDIndexer
+{
+    RolledNDIndexer(int nd,
+                    const ssize_t *shape,
+                    const ssize_t *strides,
+                    const ssize_t *ndshifts,
+                    ssize_t starting_offset)
+        : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts),
+          starting_offset_(starting_offset)
+    {
+    }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        return compute_offset(gid);
+    }
+
+private:
+    int nd_ = -1;
+    const ssize_t *shape_ = nullptr;
+    const ssize_t *strides_ = nullptr;
+    const ssize_t *ndshifts_ = nullptr;
+    ssize_t starting_offset_ = 0;
+
+    ssize_t compute_offset(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector _ind(nd_);
+        ssize_t relative_offset_(0);
+        _ind.get_left_rolled_displacement<const ssize_t *, const ssize_t *>(
+            gid,
+            shape_,    // shape ptr
+            strides_,  // strides ptr
+            ndshifts_, // shifts ptr
+            relative_offset_);
+        return starting_offset_ + relative_offset_;
+    }
+};
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class copy_for_roll_strided_kernel;
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class StridedCopyForRollFunctor
+{
+private:
+    const Ty *src_p = nullptr;
+    Ty *dst_p = nullptr;
+    SrcIndexerT src_indexer_;
+    DstIndexerT dst_indexer_;
+
+public:
+    StridedCopyForRollFunctor(const Ty *src_ptr,
+                              Ty *dst_ptr,
+                              const SrcIndexerT &src_indexer,
+                              const DstIndexerT &dst_indexer)
+        : src_p(src_ptr), dst_p(dst_ptr), src_indexer_(src_indexer),
+          dst_indexer_(dst_indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        const std::size_t gid = wiid.get(0);
+
+        const ssize_t src_offset = src_indexer_(gid);
+        const ssize_t dst_offset = dst_indexer_(gid);
+
+        dst_p[dst_offset] = src_p[src_offset];
+    }
+};
+
+// define function type
+typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,     // shift
+    std::size_t,     // num_elements
+    int,             // common_nd
+    const ssize_t *, // packed shapes and strides
+    const char *,    // src_data_ptr
+    ssize_t,         // src_offset
+    char *,          // dst_data_ptr
+    ssize_t,         // dst_offset
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy content of array with a shift.
+ *
+ * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems ,
+ * dst.shape)] = src[unravel_undex(i, src.shape)]`.
+ *
+ * @param  q      The execution queue where kernel is submitted.
+ * @param  shift  The shift in flat indexing, must be non-negative.
+ * @param  nelems The number of elements to copy
+ * @param  nd     Array dimensionality of the destination and source arrays
+ * @param  packed_shapes_and_strides Kernel accessible USM array
+ * of size `3*nd` with content `[common_shape, src_strides, dst_strides]`.
+ * @param  src_p  Typeless USM pointer to the buffer of the source array
+ * @param  src_offset Displacement of first element of src relative src_p in
+ * elements
+ * @param  dst_p  Typeless USM pointer to the buffer of the destination array
+ * @param  dst_offset Displacement of first element of dst relative dst_p in
+ * elements
+ * @param  depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename Ty>
+sycl::event copy_for_roll_strided_impl(sycl::queue &q,
+                                       std::size_t shift,
+                                       std::size_t nelems,
+                                       int nd,
+                                       const ssize_t *packed_shapes_and_strides,
+                                       const char *src_p,
+                                       ssize_t src_offset,
+                                       char *dst_p,
+                                       ssize_t dst_offset,
+                                       const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
+
+    sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        // packed_shapes_and_strides:
+        //   USM array of size 3 * nd
+        //   [ common_shape; src_strides; dst_strides ]
+
+        const StridedIndexer src_indexer{nd, src_offset,
+                                         packed_shapes_and_strides};
+        const LeftRolled1DTransformer left_roll_transformer{shift, nelems};
+
+        using CompositeIndexerT =
+            CompositionIndexer<StridedIndexer, LeftRolled1DTransformer>;
+
+        const CompositeIndexerT rolled_src_indexer(src_indexer,
+                                                   left_roll_transformer);
+
+        UnpackedStridedIndexer dst_indexer{nd, dst_offset,
+                                           packed_shapes_and_strides,
+                                           packed_shapes_and_strides + 2 * nd};
+
+        using KernelName = copy_for_roll_strided_kernel<Ty, CompositeIndexerT,
+                                                        UnpackedStridedIndexer>;
+
+        const Ty *src_tp = reinterpret_cast<const Ty *>(src_p);
+        Ty *dst_tp = reinterpret_cast<Ty *>(dst_p);
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems),
+            StridedCopyForRollFunctor<Ty, CompositeIndexerT,
+                                      UnpackedStridedIndexer>(
+                src_tp, dst_tp, rolled_src_indexer, dst_indexer));
+    });
+
+    return copy_for_roll_ev;
+}
+
+// define function type
+typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,  // shift
+    std::size_t,  // num_elements
+    const char *, // src_data_ptr
+    ssize_t,      // src_offset
+    char *,       // dst_data_ptr
+    ssize_t,      // dst_offset
+    const std::vector<sycl::event> &);
+
+template <typename Ty>
+class copy_for_roll_contig_kernel;
+
+/*!
+ * @brief Function to copy content of array with a shift.
+ *
+ * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems ,
+ * dst.shape)] = src[unravel_undex(i, src.shape)]`.
+ *
+ * @param  q      The execution queue where kernel is submitted.
+ * @param  shift  The shift in flat indexing, must be non-negative.
+ * @param  nelems The number of elements to copy
+ * @param  src_p  Typeless USM pointer to the buffer of the source array
+ * @param  src_offset Displacement of the start of array src relative src_p in
+ * elements
+ * @param  dst_p  Typeless USM pointer to the buffer of the destination array
+ * @param  dst_offset Displacement of the start of array dst relative dst_p in
+ * elements
+ * @param  depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename Ty>
+sycl::event copy_for_roll_contig_impl(sycl::queue &q,
+                                      std::size_t shift,
+                                      std::size_t nelems,
+                                      const char *src_p,
+                                      ssize_t src_offset,
+                                      char *dst_p,
+                                      ssize_t dst_offset,
+                                      const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
+
+    sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        static constexpr NoOpIndexer src_indexer{};
+        const LeftRolled1DTransformer roller{shift, nelems};
+
+        const CompositionIndexer<NoOpIndexer, LeftRolled1DTransformer>
+            left_rolled_src_indexer{src_indexer, roller};
+        static constexpr NoOpIndexer dst_indexer{};
+
+        using KernelName = copy_for_roll_contig_kernel<Ty>;
+
+        const Ty *src_tp = reinterpret_cast<const Ty *>(src_p) + src_offset;
+        Ty *dst_tp = reinterpret_cast<Ty *>(dst_p) + dst_offset;
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems),
+            StridedCopyForRollFunctor<
+                Ty, CompositionIndexer<NoOpIndexer, LeftRolled1DTransformer>,
+                NoOpIndexer>(src_tp, dst_tp, left_rolled_src_indexer,
+                             dst_indexer));
+    });
+
+    return copy_for_roll_ev;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given array data
+ * type `Ty`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename Ty>
+struct CopyForRollStridedFactory
+{
+    fnT get()
+    {
+        fnT f = copy_for_roll_strided_impl<Ty>;
+        return f;
+    }
+};
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given array data
+ * type `Ty`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename Ty>
+struct CopyForRollContigFactory
+{
+    fnT get()
+    {
+        fnT f = copy_for_roll_contig_impl<Ty>;
+        return f;
+    }
+};
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class copy_for_roll_ndshift_strided_kernel;
+
+// define function type
+typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,     // num_elements
+    int,             // common_nd
+    const ssize_t *, // packed shape, strides, shifts
+    const char *,    // src_data_ptr
+    ssize_t,         // src_offset
+    char *,          // dst_data_ptr
+    ssize_t,         // dst_offset
+    const std::vector<sycl::event> &);
+
+template <typename Ty>
+sycl::event copy_for_roll_ndshift_strided_impl(
+    sycl::queue &q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *packed_shapes_and_strides_and_shifts,
+    const char *src_p,
+    ssize_t src_offset,
+    char *dst_p,
+    ssize_t dst_offset,
+    const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
+
+    sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        // packed_shapes_and_strides_and_shifts:
+        //   USM array of size 4 * nd
+        //   [ common_shape; src_strides; dst_strides; shifts ]
+
+        const ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts;
+        const ssize_t *src_strides_ptr =
+            packed_shapes_and_strides_and_shifts + nd;
+        const ssize_t *dst_strides_ptr =
+            packed_shapes_and_strides_and_shifts + 2 * nd;
+        const ssize_t *shifts_ptr =
+            packed_shapes_and_strides_and_shifts + 3 * nd;
+
+        const RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr,
+                                          shifts_ptr, src_offset};
+
+        const UnpackedStridedIndexer dst_indexer{nd, dst_offset, shape_ptr,
+                                                 dst_strides_ptr};
+
+        using KernelName = copy_for_roll_strided_kernel<Ty, RolledNDIndexer,
+                                                        UnpackedStridedIndexer>;
+
+        const Ty *src_tp = reinterpret_cast<const Ty *>(src_p);
+        Ty *dst_tp = reinterpret_cast<Ty *>(dst_p);
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems),
+            StridedCopyForRollFunctor<Ty, RolledNDIndexer,
+                                      UnpackedStridedIndexer>(
+                src_tp, dst_tp, src_indexer, dst_indexer));
+    });
+
+    return copy_for_roll_ev;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given array data
+ * type `Ty`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename Ty>
+struct CopyForRollNDShiftFactory
+{
+    fnT get()
+    {
+        fnT f = copy_for_roll_ndshift_strided_impl<Ty>;
+        return f;
+    }
+};
+
+} // namespace dpctl::tensor::kernels::copy_and_cast
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
new file mode 100644
index 000000000000..37126a22dc64
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
@@ -0,0 +1,646 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor copying and value casting.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "dpctl_tensor_types.hpp"
+#include "kernels/alignment.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::copy_as_contig
+{
+
+using dpctl::tensor::ssize_t;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
+template <typename T,
+          typename IndexerT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+class CopyAsCContigFunctor
+{
+private:
+    std::size_t nelems;
+    const T *src_p = nullptr;
+    T *dst_p = nullptr;
+    IndexerT src_indexer;
+
+public:
+    CopyAsCContigFunctor(std::size_t n,
+                         const T *src_,
+                         T *dst_,
+                         const IndexerT &src_indexer_)
+        : nelems(n), src_p(src_), dst_p(dst_), src_indexer(src_indexer_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        static_assert(vec_sz > 0);
+        static_assert(n_vecs > 0);
+
+        static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs;
+
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (!enable_sg_loadstore || is_complex<T>::value) {
+            const std::uint16_t sgSize =
+                ndit.get_sub_group().get_max_local_range()[0];
+            const std::size_t gid = ndit.get_global_linear_id();
+
+            // start = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize)
+            // gid % sgSize == gid - (gid / sgSize) * sgSize
+            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;
+            const std::size_t start =
+                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
+            const std::size_t end = std::min(nelems, start + elems_per_sg);
+
+            for (std::size_t offset = start; offset < end; offset += sgSize) {
+                auto src_offset = src_indexer(offset);
+                dst_p[offset] = src_p[src_offset];
+            }
+        }
+        else {
+            auto sg = ndit.get_sub_group();
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+            const std::size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+            const std::uint16_t elems_per_sg = elems_per_wi * sgSize;
+
+            if (base + elems_per_sg < nelems) {
+#pragma unroll
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    // it == vec_id * vec_sz, for  0 <= vec_id < n_vecs
+                    const std::size_t block_start_id = base + it * sgSize;
+                    auto dst_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&dst_p[block_start_id]);
+
+                    const std::size_t elem_id0 =
+                        block_start_id + sg.get_local_id();
+                    sycl::vec<T, vec_sz> dst_vec;
+#pragma unroll
+                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
+                        const std::size_t elem_id = elem_id0 + k * sgSize;
+                        const ssize_t src_offset = src_indexer(elem_id);
+                        dst_vec[k] = src_p[src_offset];
+                    }
+                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                const std::size_t k0 = base + lane_id;
+                for (std::size_t k = k0; k < nelems; k += sgSize) {
+                    const ssize_t src_offset = src_indexer(k);
+                    dst_p[k] = src_p[src_offset];
+                }
+            }
+        }
+    }
+};
+
+template <typename T,
+          typename IndexerT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs,
+          bool enable_sg_load,
+          typename KernelName>
+sycl::event submit_c_contiguous_copy(sycl::queue &exec_q,
+                                     std::size_t nelems,
+                                     const T *src,
+                                     T *dst,
+                                     const IndexerT &src_indexer,
+                                     const std::vector<sycl::event> &depends)
+{
+    static_assert(vec_sz > 0);
+    static_assert(n_vecs > 0);
+
+    static constexpr std::size_t preferred_lws = 256;
+
+    const auto &kernel_id = sycl::get_kernel_id<KernelName>();
+
+    auto const &ctx = exec_q.get_context();
+    auto const &dev = exec_q.get_device();
+    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+        ctx, {dev}, {kernel_id});
+
+    auto krn = kb.get_kernel(kernel_id);
+
+    const std::uint32_t max_sg_size = krn.template get_info<
+        sycl::info::kernel_device_specific::max_sub_group_size>(dev);
+
+    const std::size_t lws =
+        ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size;
+
+    static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
+
+    const std::size_t nelems_per_group = nelems_per_wi * lws;
+    const std::size_t n_groups =
+        (nelems + nelems_per_group - 1) / (nelems_per_group);
+
+    sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.use_kernel_bundle(kb);
+
+        const sycl::range<1> gRange{n_groups * lws};
+        const sycl::range<1> lRange{lws};
+
+        cgh.parallel_for<KernelName>(
+            sycl::nd_range<1>(gRange, lRange),
+            CopyAsCContigFunctor<T, IndexerT, vec_sz, n_vecs, enable_sg_load>(
+                nelems, src, dst, src_indexer));
+    });
+    return copy_ev;
+}
+
+template <typename T,
+          typename IndexT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs,
+          bool enable_sgload>
+class as_contig_krn;
+
+template <typename T>
+sycl::event
+    as_c_contiguous_array_generic_impl(sycl::queue &exec_q,
+                                       std::size_t nelems,
+                                       int nd,
+                                       const ssize_t *shape_and_strides,
+                                       const char *src_p,
+                                       char *dst_p,
+                                       const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<T>(exec_q);
+
+    const T *src_tp = reinterpret_cast<const T *>(src_p);
+    T *dst_tp = reinterpret_cast<T *>(dst_p);
+
+    using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides);
+
+    static constexpr std::uint8_t vec_sz = 4u;
+    static constexpr std::uint8_t n_vecs = 2u;
+
+    using dpctl::tensor::kernels::alignment_utils::
+        disabled_sg_loadstore_wrapper_krn;
+    using dpctl::tensor::kernels::alignment_utils::is_aligned;
+    using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
+    sycl::event copy_ev;
+    if (is_aligned<required_alignment>(dst_p)) {
+        static constexpr bool enable_sg_load = true;
+        using KernelName =
+            as_contig_krn<T, IndexerT, vec_sz, n_vecs, enable_sg_load>;
+        copy_ev = submit_c_contiguous_copy<T, IndexerT, vec_sz, n_vecs,
+                                           enable_sg_load, KernelName>(
+            exec_q, nelems, src_tp, dst_tp, src_indexer, depends);
+    }
+    else {
+        static constexpr bool disable_sg_load = false;
+        using InnerKernelName =
+            as_contig_krn<T, IndexerT, vec_sz, n_vecs, disable_sg_load>;
+        using KernelName = disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+        copy_ev = submit_c_contiguous_copy<T, IndexerT, vec_sz, n_vecs,
+                                           disable_sg_load, KernelName>(
+            exec_q, nelems, src_tp, dst_tp, src_indexer, depends);
+    }
+
+    return copy_ev;
+}
+
+typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const ssize_t *,
+    const char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+template <typename fnT, typename T>
+struct AsCContigFactory
+{
+    fnT get()
+    {
+        return as_c_contiguous_array_generic_impl<T>;
+    }
+};
+
+template <typename T,
+          typename IndexerT,
+          std::uint16_t tile_size,
+          std::uint16_t n_lines>
+class as_contig_batch_of_square_matrices_krn;
+
+namespace detail
+{
+/*! @brief batch of matrices (n, n), source strides (1, src_ld), destination
+   strides (dst_ld, 1) src and destination arrays must be disjoint memory blocks
+   to avoid race condition
+ */
+template <typename T, typename BatchIndexerT>
+sycl::event as_c_contiguous_batch_of_square_matrices_impl(
+    sycl::queue &exec_q,
+    std::size_t batch_nelems,
+    const BatchIndexerT &batch_two_offsets_indexer,
+    std::size_t n,
+    const char *src_p,
+    ssize_t src_ld,
+    char *dst_p,
+    ssize_t dst_ld,
+    const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<T>(exec_q);
+
+    const T *src_tp = reinterpret_cast<const T *>(src_p);
+    T *dst_tp = reinterpret_cast<T *>(dst_p);
+
+    static constexpr std::uint16_t private_tile_size = 4;
+    static constexpr std::uint16_t n_lines = 2;
+    static constexpr std::uint16_t block_size =
+        n_lines * private_tile_size * private_tile_size;
+
+    static constexpr std::uint16_t lws0 = block_size;
+    static constexpr std::uint16_t lws1 = n_lines;
+    static constexpr std::uint16_t nelems_per_wi = (block_size / lws1);
+
+    static_assert(nelems_per_wi * lws1 == block_size);
+    static_assert(nelems_per_wi == private_tile_size * private_tile_size);
+
+    static constexpr std::uint32_t lws = lws0 * lws1;
+
+    const std::size_t n_tiles = (n + block_size - 1) / block_size;
+
+    const ssize_t src_stride = src_ld;
+    const ssize_t dst_stride = dst_ld;
+
+    sycl::range<1> lRange{lws};
+    sycl::range<1> gRange{batch_nelems * n_tiles * n_tiles * lws};
+
+    sycl::nd_range<1> ndRange{gRange, lRange};
+
+    using KernelName =
+        as_contig_batch_of_square_matrices_krn<T, BatchIndexerT,
+                                               private_tile_size, lws1>;
+
+    sycl::event e = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        sycl::local_accessor<T, 1> local_block(block_size * block_size, cgh);
+
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> nd_it) {
+            // 1. Read block from source array into SLM
+            const std::uint32_t lid_lin = nd_it.get_local_linear_id();
+            const std::size_t gr_id_lin = nd_it.get_group_linear_id();
+
+            const std::size_t batch_id = gr_id_lin / (n_tiles * n_tiles);
+            const std::size_t rem = gr_id_lin - batch_id * (n_tiles * n_tiles);
+
+            const auto &batch_two_offsets = batch_two_offsets_indexer(batch_id);
+            const auto &src_batch_offset = batch_two_offsets.get_first_offset();
+            const auto &dst_batch_offset =
+                batch_two_offsets.get_second_offset();
+
+            // Block id
+            /* 0 <= src_gr_i1 < n_groups_n1 */
+            const std::size_t src_tile_i1 = rem / n_tiles;
+            /* 0 <= src_gr_i0 < n_groups_n0 */
+            const std::size_t src_tile_i0 = rem - src_tile_i1 * n_tiles;
+
+            // ID of element within the block
+            /* 0 <= src_i1 < lws1 */
+            const std::uint32_t src_i1 = lid_lin / lws0;
+            /* 0 <= src_i0 < lws0 */
+            const std::uint32_t src_i0 = lid_lin - src_i1 * lws0;
+
+            // Matrix element ID
+            const std::size_t src_tile_start0 = src_tile_i0 * block_size;
+            const std::size_t src_tile_start1 = src_tile_i1 * block_size;
+            const std::size_t src_gid0 = (src_tile_start0 + src_i0);
+            const std::size_t src_gid1 = (src_tile_start1 + src_i1);
+
+            // src_offset = src_gid0 * 1 + (src_gid1 + pr_id * lws1) *
+            // src_stride
+            const std::size_t src_offset0 =
+                src_batch_offset + src_gid0 * 1 + src_gid1 * src_stride;
+            const std::size_t pr_step_src = lws1 * src_stride;
+
+            const std::uint32_t local_offset0 = src_i0 + src_i1 * block_size;
+            const std::uint32_t pr_step_local = lws1 * block_size;
+
+            for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) {
+                local_block[local_offset0 + pr_step_local * pr_id] =
+                    (src_gid0 < n && src_gid1 + pr_id * lws1 < n)
+                        ? src_tp[src_offset0 + pr_step_src * pr_id]
+                        : T(0);
+            }
+
+            const std::uint32_t local_dim0 = static_cast<std::uint32_t>(
+                std::min<std::size_t>(src_tile_start0 + block_size, n) -
+                src_tile_start0);
+            const std::uint32_t local_dim1 = static_cast<std::uint32_t>(
+                std::min<std::size_t>(src_tile_start1 + block_size, n) -
+                src_tile_start1);
+
+            sycl::group_barrier(nd_it.get_group(),
+                                sycl::memory_scope::work_group);
+
+            // 2. Permute the block matrix in SLM using two private arrays
+            std::array<T, nelems_per_wi> private_block_01 = {T(0)};
+            std::array<T, nelems_per_wi> private_block_10 = {T(0)};
+
+            // 0 <= lid_lin < lws0 * lws1 ==
+            //       (block_size * block_size / nelems_per_wi) ==
+            //       (block_size/private_tile_size)**2
+            static constexpr std::uint16_t n_private_tiles_per_axis =
+                block_size / private_tile_size;
+            const std::uint16_t local_tile_id0 =
+                lid_lin / n_private_tiles_per_axis;
+            const std::uint16_t local_tile_id1 =
+                lid_lin - local_tile_id0 * n_private_tiles_per_axis;
+
+            if (local_tile_id0 <= local_tile_id1) {
+                for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size;
+                     ++pr_i0) {
+                    for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size;
+                         ++pr_i1) {
+                        const std::uint16_t t0_offset =
+                            local_tile_id0 * private_tile_size;
+                        const std::uint16_t t1_offset =
+                            local_tile_id1 * private_tile_size;
+
+                        const std::uint16_t pr_offset =
+                            pr_i1 * private_tile_size + pr_i0;
+                        const std::uint16_t rel_offset =
+                            pr_i0 + pr_i1 * block_size;
+
+                        // read (local_tile_id0, local_tile_id1)
+                        const std::uint16_t local_01_offset =
+                            (t0_offset + t1_offset * block_size) + rel_offset;
+                        private_block_01[pr_offset] =
+                            local_block[local_01_offset];
+
+                        // read (local_tile_id1, local_tile_id0)
+                        const std::uint16_t local_10_offset =
+                            (t1_offset + t0_offset * block_size) + rel_offset;
+                        private_block_10[pr_offset] =
+                            local_block[local_10_offset];
+                    }
+                }
+            }
+
+            sycl::group_barrier(nd_it.get_group(),
+                                sycl::memory_scope::work_group);
+
+            if (local_tile_id0 <= local_tile_id1) {
+                for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size;
+                     ++pr_i0) {
+                    for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size;
+                         ++pr_i1) {
+                        const std::uint16_t t0_offset =
+                            local_tile_id0 * private_tile_size;
+                        const std::uint16_t t1_offset =
+                            local_tile_id1 * private_tile_size;
+                        const std::uint16_t pr_offset =
+                            pr_i0 * private_tile_size + pr_i1;
+
+                        const std::uint16_t rel_offset =
+                            pr_i0 + pr_i1 * block_size;
+
+                        // write back permuted private blocks
+                        const std::uint32_t local_01_offset =
+                            (t0_offset + t1_offset * block_size) + rel_offset;
+                        local_block[local_01_offset] =
+                            private_block_10[pr_offset];
+
+                        const std::uint16_t local_10_offset =
+                            (t1_offset + t0_offset * block_size) + rel_offset;
+                        local_block[local_10_offset] =
+                            private_block_01[pr_offset];
+                    }
+                }
+            }
+
+            sycl::group_barrier(nd_it.get_group(),
+                                sycl::memory_scope::work_group);
+
+            // 3. Write out permuted SLM to destination array
+
+            const std::size_t dst_tile_start0 = src_tile_start0;
+            const std::size_t dst_tile_start1 = src_tile_start1;
+
+            if (local_dim0 == block_size && local_dim1 == block_size) {
+                const std::uint16_t dst_i0 = src_i1;
+                const std::uint16_t dst_i1 = src_i0;
+
+                const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0);
+                const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1);
+
+                const std::size_t dst_offset0 =
+                    dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1;
+                const std::size_t pr_step_dst = lws1 * dst_stride;
+
+                const std::uint16_t _local_offset0 =
+                    dst_i0 * block_size + dst_i1;
+                const std::uint16_t _pr_step_local = lws1 * block_size;
+
+                for (std::uint16_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) {
+                    if ((dst_gid1 < n) && ((dst_gid0 + pr_id * lws1) < n)) {
+                        dst_tp[dst_offset0 + pr_step_dst * pr_id] =
+                            local_block[_local_offset0 +
+                                        _pr_step_local * pr_id];
+                    }
+                }
+            }
+            else {
+                // map local_linear_id into (local_dim0, local_dim1)
+                for (std::uint16_t el_id = lid_lin;
+                     el_id < local_dim0 * local_dim1; el_id += lws0 * lws1)
+                {
+
+                    // 0 <= local_i0 < local_dim0
+                    const std::uint16_t loc_i0 = el_id / local_dim1;
+                    // 0 <= local_i1 < local_dim1
+                    const std::uint16_t loc_i1 = el_id - loc_i0 * local_dim1;
+
+                    const std::uint16_t dst_i0 = loc_i0;
+                    const std::uint16_t dst_i1 = loc_i1;
+
+                    const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0);
+                    const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1);
+
+                    const std::size_t dst_offset =
+                        dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1;
+                    const std::uint16_t local_offset =
+                        loc_i0 * block_size + loc_i1;
+
+                    if ((dst_gid1 < n) && (dst_gid0 < n)) {
+                        dst_tp[dst_offset] = local_block[local_offset];
+                    }
+                }
+            }
+        });
+    });
+
+    return e;
+}
+
+} // end of namespace detail
+
+template <typename T>
+sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl(
+    sycl::queue &exec_q,
+    std::size_t batch_nelems,
+    ssize_t src_batch_step,
+    ssize_t dst_batch_step,
+    std::size_t n,
+    const char *src_p,
+    ssize_t src_ld,
+    char *dst_p,
+    ssize_t dst_ld,
+    const std::vector<sycl::event> &depends)
+{
+    using dpctl::tensor::offset_utils::Strided1DIndexer;
+    using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer;
+    using BatchIndexerT =
+        TwoOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer>;
+
+    const auto &src_batch_indexer =
+        Strided1DIndexer(batch_nelems, src_batch_step);
+    const auto &dst_batch_indexer =
+        Strided1DIndexer(batch_nelems, dst_batch_step);
+
+    const BatchIndexerT batch_two_indexer{src_batch_indexer, dst_batch_indexer};
+
+    return detail::as_c_contiguous_batch_of_square_matrices_impl<T,
+                                                                 BatchIndexerT>(
+        exec_q, batch_nelems, batch_two_indexer, n, src_p, src_ld, dst_p,
+        dst_ld, depends);
+}
+
+typedef sycl::event (
+    *as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t)(
+    sycl::queue &, /* execution queue */
+    std::size_t,   /* number of batch elements */
+    ssize_t,       /* distance between batches in source array */
+    ssize_t,       /* distance between batches in destination array */
+    std::size_t,   /* size of square matrices in the batch */
+    const char *,
+    ssize_t, /* untyped pointer to F-contig source array, and matrix leading
+                dimension */
+    char *,
+    ssize_t, /* untyped pointer to C-contig destination array, and matrix
+                leading dimension */
+    const std::vector<sycl::event> &);
+
+template <typename fnT, typename T>
+struct AsCContig1DBatchOfSquareMatricesFactory
+{
+    fnT get()
+    {
+        return as_c_contiguous_1d_batch_of_square_matrices_impl<T>;
+    }
+};
+
+template <typename T>
+sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl(
+    sycl::queue &exec_q,
+    std::size_t batch_nelems,
+    int batch_nd,
+    const ssize_t *src_batch_shape_strides,
+    const ssize_t dst_batch_step,
+    std::size_t n,
+    const char *src_p,
+    ssize_t src_ld,
+    char *dst_p,
+    ssize_t dst_ld,
+    const std::vector<sycl::event> &depends)
+{
+    using SrcIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    using DstIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+    using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer;
+    using BatchIndexerT = TwoOffsets_CombinedIndexer<SrcIndexerT, DstIndexerT>;
+
+    static constexpr ssize_t zero_offset{0};
+
+    const SrcIndexerT src_batch_indexer{batch_nd, zero_offset,
+                                        src_batch_shape_strides};
+    const DstIndexerT dst_batch_indexer{/* size */ batch_nelems,
+                                        /* step */ dst_batch_step};
+
+    const BatchIndexerT batch_two_offsets_indexer{src_batch_indexer,
+                                                  dst_batch_indexer};
+
+    return detail::as_c_contiguous_batch_of_square_matrices_impl<T,
+                                                                 BatchIndexerT>(
+        exec_q, batch_nelems, batch_two_offsets_indexer, n, src_p, src_ld,
+        dst_p, dst_ld, depends);
+}
+
+typedef sycl::event (
+    *as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t)(
+    sycl::queue &, /* execution queue */
+    std::size_t,   /* number of matrices in the batch */
+    int,
+    const ssize_t *, /* dimensionality, and packed [shape, src_strides]
+                        describing iteration over batch in source array */
+    ssize_t,         /* distance between batches in destination array */
+    std::size_t,     /* matrix size */
+    const char *,
+    ssize_t, /* untyped pointer to source array of F-contig matrices, and
+                leading dimension of the matrix */
+    char *,
+    ssize_t, /* untyped pointer to destination array of F-contig matrices, and
+                leading dimension of the matrix */
+    const std::vector<sycl::event> &);
+
+template <typename fnT, typename T>
+struct AsCContigNDBatchOfSquareMatricesFactory
+{
+    fnT get()
+    {
+        return as_c_contiguous_nd_batch_of_square_matrices_impl<T>;
+    }
+};
+} // namespace dpctl::tensor::kernels::copy_as_contig
diff --git a/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
rename to dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
rename to dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp
diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/utils/math_utils.hpp
rename to dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp
diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
rename to dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp
diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/utils/offset_utils.hpp
rename to dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp
diff --git a/dpctl/tensor/libtensor/include/utils/output_validation.hpp b/dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/utils/output_validation.hpp
rename to dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp
diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/utils/strided_iters.hpp
rename to dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
rename to dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
rename to dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp
diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/utils/type_dispatch.hpp
rename to dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp
diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp
rename to dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp
diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/utils/type_utils.hpp
rename to dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
new file mode 100644
index 000000000000..3d20be02f885
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
@@ -0,0 +1,297 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#include <array>
+#include <cstddef>
+#include <sycl/sycl.hpp>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "kernels/copy_and_cast.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "copy_as_contig.hpp"
+#include "simplify_iteration_space.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t;
+using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_contig_fn_ptr_t;
+using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t;
+
+static copy_and_cast_generic_fn_ptr_t
+    copy_and_cast_generic_dispatch_table[td_ns::num_types][td_ns::num_types];
+static copy_and_cast_1d_fn_ptr_t
+    copy_and_cast_1d_dispatch_table[td_ns::num_types][td_ns::num_types];
+static copy_and_cast_contig_fn_ptr_t
+    copy_and_cast_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+namespace py = pybind11;
+
+using dpctl::utils::keep_args_alive;
+
+std::pair<sycl::event, sycl::event> copy_usm_ndarray_into_usm_ndarray(
+    const dpctl::tensor::usm_ndarray &src,
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends = {})
+{
+    // array dimensions must be the same
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (src_nd != dst_nd) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    // shapes must be the same
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+
+    for (int i = 0; shapes_equal && (i < src_nd); ++i) {
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    // check that arrays do not overlap, and concurrent copying is safe.
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        // TODO: could use a temporary, but this is done by the caller
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_dst_f_contig = dst.is_f_contiguous();
+
+    // check for applicability of special cases:
+    //      (both C-contiguous || both F-contiguous)
+    bool both_c_contig = (is_src_c_contig && is_dst_c_contig);
+    bool both_f_contig = (is_src_f_contig && is_dst_f_contig);
+    if (both_c_contig || both_f_contig) {
+
+        sycl::event copy_ev;
+        if (src_type_id == dst_type_id) {
+
+            int src_elem_size = src.get_elemsize();
+
+            copy_ev = exec_q.memcpy(static_cast<void *>(dst_data),
+                                    static_cast<const void *>(src_data),
+                                    src_nelems * src_elem_size, depends);
+        }
+        else {
+            auto contig_fn =
+                copy_and_cast_contig_dispatch_table[dst_type_id][src_type_id];
+            copy_ev =
+                contig_fn(exec_q, src_nelems, src_data, dst_data, depends);
+        }
+        // make sure src and dst are not GC-ed before copy_ev is complete
+        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
+                              copy_ev);
+    }
+
+    if ((src_type_id == dst_type_id) && (src_nd > 1)) {
+        if (is_dst_c_contig) {
+            return py_as_c_contig(src, dst, exec_q, depends);
+        }
+        else if (is_dst_f_contig) {
+            return py_as_f_contig(src, dst, exec_q, depends);
+        }
+    }
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_nd;
+    const py::ssize_t *shape = src_shape;
+
+    // nd, simplified_* and *_offset are modified by reference
+    dpctl::tensor::py_internal::simplify_iteration_space(
+        nd, shape, src_strides, dst_strides,
+        // output
+        simplified_shape, simplified_src_strides, simplified_dst_strides,
+        src_offset, dst_offset);
+
+    if (nd < 2) {
+        if (nd == 1) {
+            std::array<py::ssize_t, 1> shape_arr = {simplified_shape[0]};
+            std::array<py::ssize_t, 1> src_strides_arr = {
+                simplified_src_strides[0]};
+            std::array<py::ssize_t, 1> dst_strides_arr = {
+                simplified_dst_strides[0]};
+
+            sycl::event copy_and_cast_1d_event;
+            if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) &&
+                (src_offset == 0) && (dst_offset == 0))
+            {
+                auto contig_fn =
+                    copy_and_cast_contig_dispatch_table[dst_type_id]
+                                                       [src_type_id];
+                copy_and_cast_1d_event =
+                    contig_fn(exec_q, src_nelems, src_data, dst_data, depends);
+            }
+            else {
+                auto fn =
+                    copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id];
+                copy_and_cast_1d_event =
+                    fn(exec_q, src_nelems, shape_arr, src_strides_arr,
+                       dst_strides_arr, src_data, src_offset, dst_data,
+                       dst_offset, depends);
+            }
+            return std::make_pair(
+                keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}),
+                copy_and_cast_1d_event);
+        }
+        else if (nd == 0) { // case of a scalar
+            assert(src_nelems == 1);
+            std::array<py::ssize_t, 1> shape_arr = {1};
+            std::array<py::ssize_t, 1> src_strides_arr = {1};
+            std::array<py::ssize_t, 1> dst_strides_arr = {1};
+
+            auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id];
+
+            sycl::event copy_and_cast_0d_event = fn(
+                exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr,
+                src_data, src_offset, dst_data, dst_offset, depends);
+
+            return std::make_pair(
+                keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}),
+                copy_and_cast_0d_event);
+        }
+    }
+
+    // Generic implementation
+    auto copy_and_cast_fn =
+        copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_shape, simplified_src_strides,
+        simplified_dst_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    const sycl::event &copy_and_cast_generic_ev = copy_and_cast_fn(
+        exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data,
+        dst_offset, depends, {copy_shape_ev});
+
+    // async free of shape_strides temporary
+    const auto &temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {copy_and_cast_generic_ev}, shape_strides_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          copy_and_cast_generic_ev);
+}
+
+void init_copy_and_cast_usm_to_usm_dispatch_tables(void)
+{
+    using namespace td_ns;
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyAndCastContigFactory;
+    DispatchTableBuilder<copy_and_cast_contig_fn_ptr_t,
+                         CopyAndCastContigFactory, num_types>
+        dtb_contig;
+    dtb_contig.populate_dispatch_table(copy_and_cast_contig_dispatch_table);
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory;
+    DispatchTableBuilder<copy_and_cast_generic_fn_ptr_t,
+                         CopyAndCastGenericFactory, num_types>
+        dtb_generic;
+    dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table);
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory;
+    DispatchTableBuilder<copy_and_cast_1d_fn_ptr_t, CopyAndCast1DFactory,
+                         num_types>
+        dtb_1d;
+    dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
new file mode 100644
index 000000000000..d2e07b08d38f
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
@@ -0,0 +1,53 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <sycl/sycl.hpp>
+#include <utility>
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event> copy_usm_ndarray_into_usm_ndarray(
+    const dpctl::tensor::usm_ndarray &src,
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends = {});
+
+extern void init_copy_and_cast_usm_to_usm_dispatch_tables();
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp
new file mode 100644
index 000000000000..bbee24c95d4d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp
@@ -0,0 +1,786 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <iterator>
+#include <numeric>
+#include <stdexcept>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "kernels/copy_as_contiguous.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "copy_as_contig.hpp"
+#include "simplify_iteration_space.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::copy_as_contig::
+    as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t;
+using dpctl::tensor::kernels::copy_as_contig::
+    as_c_contiguous_array_impl_fn_ptr_t;
+using dpctl::tensor::kernels::copy_as_contig::
+    as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t;
+using dpctl::utils::keep_args_alive;
+
+static as_c_contiguous_array_impl_fn_ptr_t
+    as_c_contig_array_dispatch_vector[td_ns::num_types];
+
+static as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t
+    as_c_contig_1d_batch_of_square_matrices_dispatch_vector[td_ns::num_types];
+
+static as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t
+    as_c_contig_nd_batch_of_square_matrices_dispatch_vector[td_ns::num_types];
+
+void init_copy_as_contig_dispatch_vectors(void)
+{
+
+    using dpctl::tensor::kernels::copy_as_contig::
+        AsCContig1DBatchOfSquareMatricesFactory;
+    using dpctl::tensor::kernels::copy_as_contig::AsCContigFactory;
+    using dpctl::tensor::kernels::copy_as_contig::
+        AsCContigNDBatchOfSquareMatricesFactory;
+    using td_ns::DispatchVectorBuilder;
+
+    // Generic to c-contig
+    DispatchVectorBuilder<as_c_contiguous_array_impl_fn_ptr_t, AsCContigFactory,
+                          td_ns::num_types>
+        dtv_as_c_contig_array;
+
+    dtv_as_c_contig_array.populate_dispatch_vector(
+        as_c_contig_array_dispatch_vector);
+
+    // 1D batch of square views into F-contig matrices to c-contig array
+    DispatchVectorBuilder<
+        as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t,
+        AsCContig1DBatchOfSquareMatricesFactory, td_ns::num_types>
+        dtv_as_c_contig_1d_batch_of_square_matrices;
+
+    dtv_as_c_contig_1d_batch_of_square_matrices.populate_dispatch_vector(
+        as_c_contig_1d_batch_of_square_matrices_dispatch_vector);
+
+    // ND batch of square views into F-contig matrices to c-contig array
+    DispatchVectorBuilder<
+        as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t,
+        AsCContigNDBatchOfSquareMatricesFactory, td_ns::num_types>
+        dtv_as_c_contig_nd_batch_of_square_matrices;
+
+    dtv_as_c_contig_nd_batch_of_square_matrices.populate_dispatch_vector(
+        as_c_contig_nd_batch_of_square_matrices_dispatch_vector);
+}
+
+namespace
+{
+
+template <typename dimT>
+std::size_t get_nelems(const std::vector<dimT> &shape)
+{
+    auto mult_fn = [](std::size_t prod, const dimT &term) -> std::size_t {
+        return prod * static_cast<std::size_t>(term);
+    };
+
+    static constexpr std::size_t unit{1};
+
+    const std::size_t nelems =
+        std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn);
+    return nelems;
+}
+
+} // end of anonymous namespace
+
+std::pair<sycl::event, sycl::event>
+    py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src,
+                       const dpctl::tensor::usm_ndarray &dst,
+                       sycl::queue &exec_q,
+                       const std::vector<sycl::event> &depends);
+
+std::pair<sycl::event, sycl::event>
+    py_as_c_contig(const dpctl::tensor::usm_ndarray &src,
+                   const dpctl::tensor::usm_ndarray &dst,
+                   sycl::queue &exec_q,
+                   const std::vector<sycl::event> &depends)
+{
+    /*  Same dimensions, same shape, same data-type
+     *  dst is C-contiguous.
+     */
+    const int src_nd = src.get_ndim();
+    const int dst_nd = dst.get_ndim();
+
+    if (src_nd != dst_nd) {
+        throw py::value_error("Number of dimensions must be the same");
+    }
+
+    const auto &src_shape_vec = src.get_shape_vector();
+    const auto &dst_shape_vec = dst.get_shape_vector();
+
+    if (src_shape_vec != dst_shape_vec) {
+        throw py::value_error("Shapes must be equal");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    const int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_type_id != dst_type_id) {
+        throw py::value_error(
+            "Source and destination arrays must have the same data type");
+    }
+
+    // ensures also that destination is plenty ample to accommodate all
+    // elements of src array
+    if (!dst.is_c_contiguous()) {
+        throw py::value_error("Destination array must be C-contiguous");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    // check that arrays do not overlap, and concurrent copying is safe.
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    const auto &src_strides_vec = src.get_strides_vector();
+
+    if (src_nd >= 2) {
+        auto n = dst_shape_vec.back();
+        if (n == dst_shape_vec[src_nd - 2]) {
+            static constexpr auto unit_stride = py::ssize_t(1);
+            if (src_strides_vec[src_nd - 2] == unit_stride) {
+                return py_as_c_contig_f2c(src, dst, exec_q, depends);
+            }
+        }
+    }
+
+    const std::size_t nelems = get_nelems(src_shape_vec);
+
+    if (nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // simplify iteration space
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_nd;
+
+    // nd, simplified_* and *_offset are modified by reference
+    dpctl::tensor::py_internal::simplify_iteration_space(
+        nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(),
+        // output
+        simplified_shape, simplified_src_strides, simplified_dst_strides,
+        src_offset, dst_offset);
+
+    if (!((0 == src_offset) && (0 == dst_offset))) {
+        throw std::runtime_error(
+            "Unexpected result of simplifying iteration space, 1");
+    }
+
+    std::vector<sycl::event> host_task_events{};
+    auto ptr_size_event_tuple =
+        dpctl::tensor::offset_utils::device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, simplified_shape, simplified_src_strides);
+    auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_stride = shape_stride_owner.get();
+
+    auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id];
+
+    std::vector<sycl::event> all_depends;
+    all_depends.reserve(depends.size() + 1);
+    all_depends.insert(std::end(all_depends), std::begin(depends),
+                       std::end(depends));
+    all_depends.push_back(copy_shape_ev);
+
+    sycl::event ascontig_ev =
+        ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(),
+                    dst.get_data(), all_depends);
+
+    const auto &temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev},
+                                                     shape_stride_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          ascontig_ev);
+}
+
+std::pair<sycl::event, sycl::event>
+    py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src,
+                       const dpctl::tensor::usm_ndarray &dst,
+                       sycl::queue &exec_q,
+                       const std::vector<sycl::event> &depends);
+
+std::pair<sycl::event, sycl::event>
+    py_as_f_contig(const dpctl::tensor::usm_ndarray &src,
+                   const dpctl::tensor::usm_ndarray &dst,
+                   sycl::queue &exec_q,
+                   const std::vector<sycl::event> &depends)
+{
+    /*  Same dimensions, same shape, same data-type
+     *  dst is F-contiguous.
+     */
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (src_nd != dst_nd) {
+        throw py::value_error("Number of dimensions must be the same");
+    }
+
+    const auto &src_shape_vec = src.get_shape_vector();
+    const auto &dst_shape_vec = dst.get_shape_vector();
+
+    if (src_shape_vec != dst_shape_vec) {
+        throw py::value_error("Shapes must be equal");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    const int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_type_id != dst_type_id) {
+        throw py::value_error(
+            "Source and destination arrays must have the same data type");
+    }
+
+    // ensures also that destination is plenty ample to accommodate all
+    // elements of src array
+    if (!dst.is_f_contiguous()) {
+        throw py::value_error("Destination array must be F-contiguous");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    // check that arrays do not overlap, and concurrent copying is safe.
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    const auto &src_strides_vec = src.get_strides_vector();
+
+    if (src_nd >= 2) {
+        auto n = dst_shape_vec.front();
+        if (n == dst_shape_vec[1]) {
+            static constexpr auto unit_stride = py::ssize_t(1);
+            if (src_strides_vec[1] == unit_stride) {
+                return py_as_f_contig_c2f(src, dst, exec_q, depends);
+            }
+        }
+    }
+
+    const std::size_t nelems = get_nelems(src_shape_vec);
+
+    if (nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // simplify batch iteration space
+    // NB: simplification reverses dst strides to C contig,
+    // it also reverses simplified_shape and simplified_src_strides
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_nd;
+
+    // nd, simplified_* and *_offset are modified by reference
+    dpctl::tensor::py_internal::simplify_iteration_space(
+        nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(),
+        // output
+        simplified_shape, simplified_src_strides, simplified_dst_strides,
+        src_offset, dst_offset);
+
+    if (!((0 == src_offset) && (0 == dst_offset))) {
+        throw std::runtime_error(
+            "Unexpected result of simplifying iteration space, 1");
+    }
+
+    std::vector<sycl::event> host_task_events{};
+    auto ptr_size_event_tuple =
+        dpctl::tensor::offset_utils::device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, simplified_shape, simplified_src_strides);
+    auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_stride = shape_stride_owner.get();
+
+    auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id];
+
+    std::vector<sycl::event> all_depends;
+    all_depends.reserve(depends.size() + 1);
+    all_depends.insert(std::end(all_depends), std::begin(depends),
+                       std::end(depends));
+    all_depends.push_back(copy_shape_ev);
+
+    sycl::event ascontig_ev =
+        ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(),
+                    dst.get_data(), all_depends);
+
+    const auto &temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev},
+                                                     shape_stride_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          ascontig_ev);
+}
+
+std::pair<sycl::event, sycl::event>
+    py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src,
+                       const dpctl::tensor::usm_ndarray &dst,
+                       sycl::queue &exec_q,
+                       const std::vector<sycl::event> &depends)
+{
+    /*  Same dimensions, same shape, same data-type
+     *  dst is C-contiguous.
+     */
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (src_nd != dst_nd) {
+        throw py::value_error("Number of dimensions must be the same.");
+    }
+    if (src_nd < 2) {
+        throw py::value_error("Arrays must have 2 or more axes");
+    }
+
+    const auto &src_shape_vec = src.get_shape_vector();
+    const auto &dst_shape_vec = dst.get_shape_vector();
+
+    std::size_t nelems{1};
+    bool equal_shapes = true;
+
+    for (int i = 0; equal_shapes && (i < src_nd); ++i) {
+        auto sh_i = src_shape_vec[i];
+        equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]);
+        nelems *= static_cast<std::size_t>(sh_i);
+    }
+
+    if (!equal_shapes) {
+        throw py::value_error("Shapes must be equal");
+    }
+
+    const auto n = src_shape_vec.back();
+    if (src_shape_vec[src_nd - 2] != n) {
+        throw py::value_error("Matrices must be square");
+    }
+
+    const auto &src_strides_vec = src.get_strides_vector();
+
+    if (src_strides_vec[src_nd - 2] != py::ssize_t(1)) {
+        throw py::value_error("Unexpected destination array layout");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    const int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_type_id != dst_type_id) {
+        throw py::value_error(
+            "Source and destination arrays must have the same data type");
+    }
+
+    // ensures also that destination is plenty ample to accommodate all
+    // elements of src array
+    if (!dst.is_c_contiguous()) {
+        throw py::value_error("Destination array must be C-contiguous");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    // check that arrays do not overlap, and concurrent copying is safe.
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    if (nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    const auto &dst_strides_vec = dst.get_strides_vector();
+
+    const std::size_t batch_nelems =
+        (src_nd == 2) ? std::size_t(1) : (nelems / (n * n));
+    const py::ssize_t dst_batch_step =
+        (src_nd == 2) ? py::ssize_t(0) : dst_strides_vec[src_nd - 3];
+
+    std::vector<py::ssize_t> src_batch_strides_vec;
+    std::vector<py::ssize_t> dst_batch_strides_vec;
+    std::vector<py::ssize_t> batch_shape_vec;
+
+    if (src_nd == 2) {
+        batch_shape_vec.push_back(py::ssize_t(1));
+        src_batch_strides_vec.push_back(py::ssize_t(0));
+        dst_batch_strides_vec.push_back(dst_batch_step);
+    }
+    else {
+        batch_shape_vec.insert(std::end(batch_shape_vec),
+                               std::begin(src_shape_vec),
+                               std::end(src_shape_vec) - 2);
+        src_batch_strides_vec.insert(std::end(src_batch_strides_vec),
+                                     std::begin(src_strides_vec),
+                                     std::end(src_strides_vec) - 2);
+        dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec),
+                                     std::begin(dst_strides_vec),
+                                     std::end(dst_strides_vec) - 2);
+    }
+
+    // simplify batch iteration space
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = static_cast<int>(batch_shape_vec.size());
+
+    // nd, simplified_* and *_offset are modified by reference
+    dpctl::tensor::py_internal::simplify_iteration_space(
+        nd, batch_shape_vec.data(), src_batch_strides_vec,
+        dst_batch_strides_vec,
+        // output
+        simplified_shape, simplified_src_strides, simplified_dst_strides,
+        src_offset, dst_offset);
+
+    if (!((0 == src_offset) && (0 == dst_offset))) {
+        throw std::runtime_error(
+            "Unexpected result of simplifying iteration space, 1");
+    }
+
+    if (1 == nd) {
+        const auto expected_dim = static_cast<py::ssize_t>(batch_nelems);
+        if ((simplified_shape.front() != expected_dim) ||
+            (simplified_dst_strides.front() != dst_batch_step))
+        {
+            throw std::runtime_error(
+                "Unexpected result of simplifying iteration space, 2");
+        }
+
+        auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector
+            [src_type_id];
+        const py::ssize_t src_batch_step = simplified_src_strides.front();
+
+        sycl::event ascontig_ev =
+            impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n,
+                    src.get_data(), src_strides_vec.back(), dst.get_data(),
+                    dst_strides_vec[src_nd - 2], depends);
+
+        return std::make_pair(
+            keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev);
+    }
+
+    auto impl_fn =
+        as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_shape, simplified_src_strides);
+    auto packed_shape_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
+
+    std::vector<sycl::event> all_depends;
+    all_depends.reserve(depends.size() + 1);
+    all_depends.insert(std::end(all_depends), std::begin(depends),
+                       std::end(depends));
+    all_depends.push_back(copy_shape_ev);
+
+    sycl::event ascontig_ev =
+        impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step,
+                n, src.get_data(), src_strides_vec.back(), dst.get_data(),
+                dst_strides_vec[src_nd - 2], all_depends);
+
+    // async free of shape_strides temporary
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {ascontig_ev}, packed_shape_strides_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          ascontig_ev);
+}
+
+std::pair<sycl::event, sycl::event>
+    py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src,
+                       const dpctl::tensor::usm_ndarray &dst,
+                       sycl::queue &exec_q,
+                       const std::vector<sycl::event> &depends)
+{
+    /*  Same dimensions, same shape, same data-type
+     *  dst is F-contiguous.
+     */
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (src_nd != dst_nd) {
+        throw py::value_error("Number of dimensions must be the same.");
+    }
+    if (src_nd < 2) {
+        throw py::value_error("Arrays must have 2 or more axes");
+    }
+
+    // ensures also that destination is plenty ample to accommodate all
+    // elements of src array
+    if (!dst.is_f_contiguous()) {
+        throw py::value_error("Destination array must be C-contiguous");
+    }
+
+    const auto &src_shape_vec = src.get_shape_vector();
+    const auto &dst_shape_vec = dst.get_shape_vector();
+
+    std::size_t nelems{1};
+    bool equal_shapes = true;
+
+    for (int i = 0; equal_shapes && (i < src_nd); ++i) {
+        auto sh_i = src_shape_vec[i];
+        equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]);
+        nelems *= static_cast<std::size_t>(sh_i);
+    }
+
+    if (!equal_shapes) {
+        throw py::value_error("Shapes must be equal");
+    }
+
+    const auto n = dst_shape_vec.front();
+    if (dst_shape_vec[1] != n) {
+        throw py::value_error("Matrices must be square");
+    }
+
+    const auto &src_strides_vec = src.get_strides_vector();
+
+    if (src_strides_vec[1] != py::ssize_t(1)) {
+        throw py::value_error("Unexpected destination array layout");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    // check that arrays do not overlap, and concurrent copying is safe.
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    const int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_type_id != dst_type_id) {
+        throw py::value_error(
+            "Source and destination arrays must have the same data type");
+    }
+
+    if (nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    const auto &dst_strides_vec = dst.get_strides_vector();
+
+    const std::size_t batch_nelems =
+        (src_nd == 2) ? std::size_t(1) : (nelems / (n * n));
+    const py::ssize_t dst_batch_step =
+        (src_nd == 2) ? py::ssize_t(0) : dst_strides_vec[2];
+
+    std::vector<py::ssize_t> src_batch_strides_vec;
+    std::vector<py::ssize_t> dst_batch_strides_vec;
+    std::vector<py::ssize_t> batch_shape_vec;
+
+    if (src_nd == 2) {
+        batch_shape_vec.push_back(py::ssize_t(1));
+        src_batch_strides_vec.push_back(py::ssize_t(0));
+        dst_batch_strides_vec.push_back(dst_batch_step);
+    }
+    else {
+        batch_shape_vec.insert(std::end(batch_shape_vec),
+                               std::begin(src_shape_vec) + 2,
+                               std::end(src_shape_vec));
+        src_batch_strides_vec.insert(std::end(src_batch_strides_vec),
+                                     std::begin(src_strides_vec) + 2,
+                                     std::end(src_strides_vec));
+        dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec),
+                                     std::begin(dst_strides_vec) + 2,
+                                     std::end(dst_strides_vec));
+    }
+
+    // simplify batch iteration space
+    // NB: simplification reverses dst strides to C contig,
+    // it also reverses simplified_shape and simplified_src_strides
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = static_cast<int>(batch_shape_vec.size());
+
+    // nd, simplified_* and *_offset are modified by reference
+    dpctl::tensor::py_internal::simplify_iteration_space(
+        nd, batch_shape_vec.data(), src_batch_strides_vec,
+        dst_batch_strides_vec,
+        // output
+        simplified_shape, simplified_src_strides, simplified_dst_strides,
+        src_offset, dst_offset);
+
+    if (!((0 == src_offset) && (0 == dst_offset))) {
+        throw std::runtime_error(
+            "Unexpected result of simplifying iteration space, 1");
+    }
+
+    if (1 == nd) {
+        const auto expected_dim = static_cast<py::ssize_t>(batch_nelems);
+        if ((simplified_shape.front() != expected_dim) ||
+            (simplified_dst_strides.front() != dst_batch_step))
+        {
+            throw std::runtime_error(
+                "Unexpected result of simplifying iteration space, 2");
+        }
+
+        auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector
+            [src_type_id];
+        const py::ssize_t src_batch_step = simplified_src_strides.front();
+
+        sycl::event ascontig_ev =
+            impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n,
+                    src.get_data(), src_strides_vec.front(), dst.get_data(),
+                    dst_strides_vec[1], depends);
+
+        return std::make_pair(
+            keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev);
+    }
+
+    auto impl_fn =
+        as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_shape, simplified_src_strides);
+    auto packed_shape_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
+
+    std::vector<sycl::event> all_depends;
+    all_depends.reserve(depends.size() + 1);
+    all_depends.insert(std::end(all_depends), std::begin(depends),
+                       std::end(depends));
+    all_depends.push_back(copy_shape_ev);
+
+    sycl::event ascontig_ev =
+        impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step,
+                n, src.get_data(), src_strides_vec.front(), dst.get_data(),
+                dst_strides_vec[1], all_depends);
+
+    // async free of shape_strides
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {ascontig_ev}, packed_shape_strides_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          ascontig_ev);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp
new file mode 100644
index 000000000000..bfe3159c8813
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp
@@ -0,0 +1,54 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::py_internal
+{
+
+std::pair<sycl::event, sycl::event>
+    py_as_c_contig(const dpctl::tensor::usm_ndarray &,
+                   const dpctl::tensor::usm_ndarray &,
+                   sycl::queue &,
+                   const std::vector<sycl::event> &);
+
+std::pair<sycl::event, sycl::event>
+    py_as_f_contig(const dpctl::tensor::usm_ndarray &,
+                   const dpctl::tensor::usm_ndarray &,
+                   sycl::queue &,
+                   const std::vector<sycl::event> &);
+
+void init_copy_as_contig_dispatch_vectors(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp
new file mode 100644
index 000000000000..97a8ba83831e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp
@@ -0,0 +1,179 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <string>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+
+namespace
+{
+
+std::string _default_device_fp_type(const sycl::device &d)
+{
+    if (d.has(sycl::aspect::fp64)) {
+        return "f8";
+    }
+    else {
+        return "f4";
+    }
+}
+
+int get_numpy_major_version()
+{
+
+    py::module_ numpy = py::module_::import("numpy");
+    py::str version_string = numpy.attr("__version__");
+    py::module_ numpy_lib = py::module_::import("numpy.lib");
+
+    py::object numpy_version = numpy_lib.attr("NumpyVersion")(version_string);
+    int major_version = numpy_version.attr("major").cast<int>();
+
+    return major_version;
+}
+
+std::string _default_device_int_type(const sycl::device &)
+{
+    const int np_ver = get_numpy_major_version();
+
+    if (np_ver >= 2) {
+        return "i8";
+    }
+    else {
+        // code for numpy.dtype('long') to be consistent
+        // with NumPy's default integer type across
+        // platforms.
+        return "l";
+    }
+}
+
+std::string _default_device_uint_type(const sycl::device &)
+{
+    const int np_ver = get_numpy_major_version();
+
+    if (np_ver >= 2) {
+        return "u8";
+    }
+    else {
+        // code for numpy.dtype('long') to be consistent
+        // with NumPy's default integer type across
+        // platforms.
+        return "L";
+    }
+}
+
+std::string _default_device_complex_type(const sycl::device &d)
+{
+    if (d.has(sycl::aspect::fp64)) {
+        return "c16";
+    }
+    else {
+        return "c8";
+    }
+}
+
+std::string _default_device_bool_type(const sycl::device &)
+{
+    return "b1";
+}
+
+std::string _default_device_index_type(const sycl::device &)
+{
+    return "i8";
+}
+
+sycl::device _extract_device(const py::object &arg)
+{
+    auto const &api = dpctl::detail::dpctl_capi::get();
+
+    PyObject *source = arg.ptr();
+    if (api.PySyclQueue_Check_(source)) {
+        const sycl::queue &q = py::cast<sycl::queue>(arg);
+        return q.get_device();
+    }
+    else if (api.PySyclDevice_Check_(source)) {
+        return py::cast<sycl::device>(arg);
+    }
+    else {
+        throw py::type_error(
+            "Expected type `dpctl.SyclQueue` or `dpctl.SyclDevice`.");
+    }
+}
+
+} // namespace
+
+std::string default_device_fp_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_fp_type(d);
+}
+
+std::string default_device_int_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_int_type(d);
+}
+
+std::string default_device_uint_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_uint_type(d);
+}
+
+std::string default_device_bool_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_bool_type(d);
+}
+
+std::string default_device_complex_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_complex_type(d);
+}
+
+std::string default_device_index_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_index_type(d);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp
new file mode 100644
index 000000000000..adde7aefe3dd
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp
@@ -0,0 +1,50 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <string>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::string default_device_fp_type(const py::object &);
+extern std::string default_device_int_type(const py::object &);
+extern std::string default_device_uint_type(const py::object &);
+extern std::string default_device_bool_type(const py::object &);
+extern std::string default_device_complex_type(const py::object &);
+extern std::string default_device_index_type(const py::object &);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp
new file mode 100644
index 000000000000..e3cff701ed50
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp
@@ -0,0 +1,540 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include "simplify_iteration_space.hpp"
+#include "utils/strided_iters.hpp"
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+#include <pybind11/pybind11.h>
+#include <vector>
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+
+void simplify_iteration_space_1(int &nd,
+                                const py::ssize_t *const &shape,
+                                std::vector<py::ssize_t> const &strides,
+                                // output
+                                std::vector<py::ssize_t> &simplified_shape,
+                                std::vector<py::ssize_t> &simplified_strides,
+                                py::ssize_t &offset)
+{
+    using dpctl::tensor::strides::simplify_iteration_stride;
+    if (nd > 1) {
+        // Simplify iteration space to reduce dimensionality
+        // and improve access pattern
+        simplified_shape.reserve(nd);
+        simplified_shape.insert(std::end(simplified_shape), shape, shape + nd);
+
+        simplified_strides.reserve(nd);
+        simplified_strides.insert(std::end(simplified_strides),
+                                  std::begin(strides), std::end(strides));
+
+        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
+        assert(simplified_strides.size() == static_cast<std::size_t>(nd));
+        int contracted_nd = simplify_iteration_stride(
+            nd, simplified_shape.data(), simplified_strides.data(),
+            offset // modified by reference
+        );
+        simplified_shape.resize(contracted_nd);
+        simplified_strides.resize(contracted_nd);
+
+        nd = contracted_nd;
+    }
+    else if (nd == 1) {
+        offset = 0;
+        // Populate vectors
+        simplified_shape.reserve(nd);
+        simplified_shape.push_back(shape[0]);
+
+        simplified_strides.reserve(nd);
+        simplified_strides.push_back((strides[0] >= 0) ? strides[0]
+                                                       : -strides[0]);
+        if ((strides[0] < 0) && (shape[0] > 1)) {
+            offset += (shape[0] - 1) * strides[0];
+        }
+
+        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
+        assert(simplified_strides.size() == static_cast<std::size_t>(nd));
+    }
+}
+
+void simplify_iteration_space(int &nd,
+                              const py::ssize_t *const &shape,
+                              std::vector<py::ssize_t> const &src_strides,
+                              std::vector<py::ssize_t> const &dst_strides,
+                              // output
+                              std::vector<py::ssize_t> &simplified_shape,
+                              std::vector<py::ssize_t> &simplified_src_strides,
+                              std::vector<py::ssize_t> &simplified_dst_strides,
+                              py::ssize_t &src_offset,
+                              py::ssize_t &dst_offset)
+{
+    using dpctl::tensor::strides::simplify_iteration_two_strides;
+    if (nd > 1) {
+        // Simplify iteration space to reduce dimensionality
+        // and improve access pattern
+        simplified_shape.reserve(nd);
+        simplified_shape.insert(std::begin(simplified_shape), shape,
+                                shape + nd);
+        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
+
+        simplified_src_strides.reserve(nd);
+        simplified_src_strides.insert(std::end(simplified_src_strides),
+                                      std::begin(src_strides),
+                                      std::end(src_strides));
+        assert(simplified_src_strides.size() == static_cast<std::size_t>(nd));
+
+        simplified_dst_strides.reserve(nd);
+        simplified_dst_strides.insert(std::end(simplified_dst_strides),
+                                      std::begin(dst_strides),
+                                      std::end(dst_strides));
+        assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
+
+        int contracted_nd = simplify_iteration_two_strides(
+            nd, simplified_shape.data(), simplified_src_strides.data(),
+            simplified_dst_strides.data(),
+            src_offset, // modified by reference
+            dst_offset  // modified by reference
+        );
+        simplified_shape.resize(contracted_nd);
+        simplified_src_strides.resize(contracted_nd);
+        simplified_dst_strides.resize(contracted_nd);
+
+        nd = contracted_nd;
+    }
+    else if (nd == 1) {
+        src_offset = 0;
+        dst_offset = 0;
+        // Populate vectors
+        simplified_shape.reserve(nd);
+        simplified_shape.push_back(shape[0]);
+        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
+
+        simplified_src_strides.reserve(nd);
+        simplified_dst_strides.reserve(nd);
+
+        if (src_strides[0] < 0 && dst_strides[0] < 0) {
+            simplified_src_strides.push_back(-src_strides[0]);
+            simplified_dst_strides.push_back(-dst_strides[0]);
+            if (shape[0] > 1) {
+                src_offset += (shape[0] - 1) * src_strides[0];
+                dst_offset += (shape[0] - 1) * dst_strides[0];
+            }
+        }
+        else {
+            simplified_src_strides.push_back(src_strides[0]);
+            simplified_dst_strides.push_back(dst_strides[0]);
+        }
+
+        assert(simplified_src_strides.size() == static_cast<std::size_t>(nd));
+        assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
+    }
+}
+
+void simplify_iteration_space_3(
+    int &nd,
+    const py::ssize_t *const &shape,
+    // src1
+    std::vector<py::ssize_t> const &src1_strides,
+    // src2
+    std::vector<py::ssize_t> const &src2_strides,
+    // dst
+    std::vector<py::ssize_t> const &dst_strides,
+    // output
+    std::vector<py::ssize_t> &simplified_shape,
+    std::vector<py::ssize_t> &simplified_src1_strides,
+    std::vector<py::ssize_t> &simplified_src2_strides,
+    std::vector<py::ssize_t> &simplified_dst_strides,
+    py::ssize_t &src1_offset,
+    py::ssize_t &src2_offset,
+    py::ssize_t &dst_offset)
+{
+    using dpctl::tensor::strides::simplify_iteration_three_strides;
+    if (nd > 1) {
+        // Simplify iteration space to reduce dimensionality
+        // and improve access pattern
+        simplified_shape.reserve(nd);
+        simplified_shape.insert(std::end(simplified_shape), shape, shape + nd);
+        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
+
+        simplified_src1_strides.reserve(nd);
+        simplified_src1_strides.insert(std::end(simplified_src1_strides),
+                                       std::begin(src1_strides),
+                                       std::end(src1_strides));
+        assert(simplified_src1_strides.size() == static_cast<std::size_t>(nd));
+
+        simplified_src2_strides.reserve(nd);
+        simplified_src2_strides.insert(std::end(simplified_src2_strides),
+                                       std::begin(src2_strides),
+                                       std::end(src2_strides));
+        assert(simplified_src2_strides.size() == static_cast<std::size_t>(nd));
+
+        simplified_dst_strides.reserve(nd);
+        simplified_dst_strides.insert(std::end(simplified_dst_strides),
+                                      std::begin(dst_strides),
+                                      std::end(dst_strides));
+        assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
+
+        int contracted_nd = simplify_iteration_three_strides(
+            nd, simplified_shape.data(), simplified_src1_strides.data(),
+            simplified_src2_strides.data(), simplified_dst_strides.data(),
+            src1_offset, // modified by reference
+            src2_offset, // modified by reference
+            dst_offset   // modified by reference
+        );
+        simplified_shape.resize(contracted_nd);
+        simplified_src1_strides.resize(contracted_nd);
+        simplified_src2_strides.resize(contracted_nd);
+        simplified_dst_strides.resize(contracted_nd);
+
+        nd = contracted_nd;
+    }
+    else if (nd == 1) {
+        src1_offset = 0;
+        src2_offset = 0;
+        dst_offset = 0;
+        // Populate vectors
+        simplified_shape.reserve(nd);
+        simplified_shape.push_back(shape[0]);
+        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
+
+        simplified_src1_strides.reserve(nd);
+        simplified_src2_strides.reserve(nd);
+        simplified_dst_strides.reserve(nd);
+
+        if ((src1_strides[0] < 0) && (src2_strides[0] < 0) &&
+            (dst_strides[0] < 0)) {
+            simplified_src1_strides.push_back(-src1_strides[0]);
+            simplified_src2_strides.push_back(-src2_strides[0]);
+            simplified_dst_strides.push_back(-dst_strides[0]);
+            if (shape[0] > 1) {
+                src1_offset += src1_strides[0] * (shape[0] - 1);
+                src2_offset += src2_strides[0] * (shape[0] - 1);
+                dst_offset += dst_strides[0] * (shape[0] - 1);
+            }
+        }
+        else {
+            simplified_src1_strides.push_back(src1_strides[0]);
+            simplified_src2_strides.push_back(src2_strides[0]);
+            simplified_dst_strides.push_back(dst_strides[0]);
+        }
+
+        assert(simplified_src1_strides.size() == static_cast<std::size_t>(nd));
+        assert(simplified_src2_strides.size() == static_cast<std::size_t>(nd));
+        assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
+    }
+}
+
+void simplify_iteration_space_4(
+    int &nd,
+    const py::ssize_t *const &shape,
+    // src1
+    std::vector<py::ssize_t> const &src1_strides,
+    // src2
+    std::vector<py::ssize_t> const &src2_strides,
+    // src3
+    std::vector<py::ssize_t> const &src3_strides,
+    // dst
+    std::vector<py::ssize_t> const &dst_strides,
+    // output
+    std::vector<py::ssize_t> &simplified_shape,
+    std::vector<py::ssize_t> &simplified_src1_strides,
+    std::vector<py::ssize_t> &simplified_src2_strides,
+    std::vector<py::ssize_t> &simplified_src3_strides,
+    std::vector<py::ssize_t> &simplified_dst_strides,
+    py::ssize_t &src1_offset,
+    py::ssize_t &src2_offset,
+    py::ssize_t &src3_offset,
+    py::ssize_t &dst_offset)
+{
+    using dpctl::tensor::strides::simplify_iteration_four_strides;
+    if (nd > 1) {
+        // Simplify iteration space to reduce dimensionality
+        // and improve access pattern
+        simplified_shape.reserve(nd);
+        simplified_shape.insert(std::end(simplified_shape), shape, shape + nd);
+        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
+
+        simplified_src1_strides.reserve(nd);
+        simplified_src1_strides.insert(std::end(simplified_src1_strides),
+                                       std::begin(src1_strides),
+                                       std::end(src1_strides));
+        assert(simplified_src1_strides.size() == static_cast<std::size_t>(nd));
+
+        simplified_src2_strides.reserve(nd);
+        simplified_src2_strides.insert(std::end(simplified_src2_strides),
+                                       std::begin(src2_strides),
+                                       std::end(src2_strides));
+        assert(simplified_src2_strides.size() == static_cast<std::size_t>(nd));
+
+        simplified_src3_strides.reserve(nd);
+        simplified_src3_strides.insert(std::end(simplified_src3_strides),
+                                       std::begin(src3_strides),
+                                       std::end(src3_strides));
+        assert(simplified_src3_strides.size() == static_cast<std::size_t>(nd));
+
+        simplified_dst_strides.reserve(nd);
+        simplified_dst_strides.insert(std::end(simplified_dst_strides),
+                                      std::begin(dst_strides),
+                                      std::end(dst_strides));
+        assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
+
+        int contracted_nd = simplify_iteration_four_strides(
+            nd, simplified_shape.data(), simplified_src1_strides.data(),
+            simplified_src2_strides.data(), simplified_src3_strides.data(),
+            simplified_dst_strides.data(),
+            src1_offset, // modified by reference
+            src2_offset, // modified by reference
+            src3_offset, // modified by reference
+            dst_offset   // modified by reference
+        );
+        simplified_shape.resize(contracted_nd);
+        simplified_src1_strides.resize(contracted_nd);
+        simplified_src2_strides.resize(contracted_nd);
+        simplified_src3_strides.resize(contracted_nd);
+        simplified_dst_strides.resize(contracted_nd);
+
+        nd = contracted_nd;
+    }
+    else if (nd == 1) {
+        src1_offset = 0;
+        src2_offset = 0;
+        src3_offset = 0;
+        dst_offset = 0;
+        // Populate vectors
+        simplified_shape.reserve(nd);
+        simplified_shape.push_back(shape[0]);
+        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
+
+        simplified_src1_strides.reserve(nd);
+        simplified_src2_strides.reserve(nd);
+        simplified_src3_strides.reserve(nd);
+        simplified_dst_strides.reserve(nd);
+
+        if ((src1_strides[0] < 0) && (src2_strides[0] < 0) &&
+            (src3_strides[0] < 0) && (dst_strides[0] < 0))
+        {
+            simplified_src1_strides.push_back(-src1_strides[0]);
+            simplified_src2_strides.push_back(-src2_strides[0]);
+            simplified_src3_strides.push_back(-src3_strides[0]);
+            simplified_dst_strides.push_back(-dst_strides[0]);
+            if (shape[0] > 1) {
+                src1_offset += src1_strides[0] * (shape[0] - 1);
+                src2_offset += src2_strides[0] * (shape[0] - 1);
+                src3_offset += src3_strides[0] * (shape[0] - 1);
+                dst_offset += dst_strides[0] * (shape[0] - 1);
+            }
+        }
+        else {
+            simplified_src1_strides.push_back(src1_strides[0]);
+            simplified_src2_strides.push_back(src2_strides[0]);
+            simplified_src3_strides.push_back(src3_strides[0]);
+            simplified_dst_strides.push_back(dst_strides[0]);
+        }
+
+        assert(simplified_src1_strides.size() == static_cast<std::size_t>(nd));
+        assert(simplified_src2_strides.size() == static_cast<std::size_t>(nd));
+        assert(simplified_src3_strides.size() == static_cast<std::size_t>(nd));
+        assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
+    }
+}
+
+void compact_iteration_space(int &nd,
+                             const py::ssize_t *const &shape,
+                             std::vector<py::ssize_t> const &strides,
+                             // output
+                             std::vector<py::ssize_t> &compact_shape,
+                             std::vector<py::ssize_t> &compact_strides)
+{
+    using dpctl::tensor::strides::compact_iteration;
+    if (nd > 1) {
+        // Compact iteration space to reduce dimensionality
+        // and improve access pattern
+        compact_shape.reserve(nd);
+        compact_shape.insert(std::begin(compact_shape), shape, shape + nd);
+        assert(compact_shape.size() == static_cast<std::size_t>(nd));
+
+        compact_strides.reserve(nd);
+        compact_strides.insert(std::end(compact_strides), std::begin(strides),
+                               std::end(strides));
+        assert(compact_strides.size() == static_cast<std::size_t>(nd));
+
+        int contracted_nd =
+            compact_iteration(nd, compact_shape.data(), compact_strides.data());
+        compact_shape.resize(contracted_nd);
+        compact_strides.resize(contracted_nd);
+
+        nd = contracted_nd;
+    }
+    else if (nd == 1) {
+        // Populate vectors
+        compact_shape.reserve(nd);
+        compact_shape.push_back(shape[0]);
+        assert(compact_shape.size() == static_cast<std::size_t>(nd));
+
+        compact_strides.reserve(nd);
+        compact_strides.push_back(strides[0]);
+        assert(compact_strides.size() == static_cast<std::size_t>(nd));
+    }
+}
+
+/* @brief Split shape/strides into dir1 (complementary to axis_start <= i <
+ * axis_end) and dir2 (along given set of axes)
+ */
+void split_iteration_space(const std::vector<py::ssize_t> &shape_vec,
+                           const std::vector<py::ssize_t> &strides_vec,
+                           int axis_start,
+                           int axis_end,
+                           std::vector<py::ssize_t> &dir1_shape_vec,
+                           std::vector<py::ssize_t> &dir2_shape_vec,
+                           std::vector<py::ssize_t> &dir1_strides_vec,
+                           std::vector<py::ssize_t> &dir2_strides_vec)
+{
+    int nd = static_cast<int>(shape_vec.size());
+    int dir2_sz = axis_end - axis_start;
+    int dir1_sz = nd - dir2_sz;
+
+    assert(dir1_sz > 0);
+    assert(dir2_sz > 0);
+
+    dir1_shape_vec.resize(dir1_sz);
+    dir2_shape_vec.resize(dir2_sz);
+
+    std::copy(shape_vec.begin(), shape_vec.begin() + axis_start,
+              dir1_shape_vec.begin());
+    std::copy(shape_vec.begin() + axis_end, shape_vec.end(),
+              dir1_shape_vec.begin() + axis_start);
+
+    std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end,
+              dir2_shape_vec.begin());
+
+    dir1_strides_vec.resize(dir1_sz);
+    dir2_strides_vec.resize(dir2_sz);
+
+    std::copy(strides_vec.begin(), strides_vec.begin() + axis_start,
+              dir1_strides_vec.begin());
+    std::copy(strides_vec.begin() + axis_end, strides_vec.end(),
+              dir1_strides_vec.begin() + axis_start);
+
+    std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end,
+              dir2_strides_vec.begin());
+
+    return;
+}
+
+py::ssize_t _ravel_multi_index_c(std::vector<py::ssize_t> const &mi,
+                                 std::vector<py::ssize_t> const &shape)
+{
+    std::size_t nd = shape.size();
+    if (nd != mi.size()) {
+        throw py::value_error(
+            "Multi-index and shape vectors must have the same length.");
+    }
+
+    py::ssize_t flat_index = 0;
+    py::ssize_t s = 1;
+    for (std::size_t i = 0; i < nd; ++i) {
+        flat_index += mi.at(nd - 1 - i) * s;
+        s *= shape.at(nd - 1 - i);
+    }
+
+    return flat_index;
+}
+
+py::ssize_t _ravel_multi_index_f(std::vector<py::ssize_t> const &mi,
+                                 std::vector<py::ssize_t> const &shape)
+{
+    std::size_t nd = shape.size();
+    if (nd != mi.size()) {
+        throw py::value_error(
+            "Multi-index and shape vectors must have the same length.");
+    }
+
+    py::ssize_t flat_index = 0;
+    py::ssize_t s = 1;
+    for (std::size_t i = 0; i < nd; ++i) {
+        flat_index += mi.at(i) * s;
+        s *= shape.at(i);
+    }
+
+    return flat_index;
+}
+
+std::vector<py::ssize_t> _unravel_index_c(py::ssize_t flat_index,
+                                          std::vector<py::ssize_t> const &shape)
+{
+    std::size_t nd = shape.size();
+    std::vector<py::ssize_t> mi;
+    mi.resize(nd);
+
+    py::ssize_t i_ = flat_index;
+    for (std::size_t dim = 0; dim + 1 < nd; ++dim) {
+        const py::ssize_t si = shape[nd - 1 - dim];
+        const py::ssize_t q = i_ / si;
+        const py::ssize_t r = (i_ - q * si);
+        mi[nd - 1 - dim] = r;
+        i_ = q;
+    }
+    if (nd) {
+        mi[0] = i_;
+    }
+    return mi;
+}
+
+std::vector<py::ssize_t> _unravel_index_f(py::ssize_t flat_index,
+                                          std::vector<py::ssize_t> const &shape)
+{
+    std::size_t nd = shape.size();
+    std::vector<py::ssize_t> mi;
+    mi.resize(nd);
+
+    py::ssize_t i_ = flat_index;
+    for (std::size_t dim = 0; dim + 1 < nd; ++dim) {
+        const py::ssize_t si = shape[dim];
+        const py::ssize_t q = i_ / si;
+        const py::ssize_t r = (i_ - q * si);
+        mi[dim] = r;
+        i_ = q;
+    }
+    if (nd) {
+        mi[nd - 1] = i_;
+    }
+    return mi;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp
new file mode 100644
index 000000000000..acbc833157d1
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+#include <vector>
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+
+void simplify_iteration_space_1(int &,
+                                const py::ssize_t *const &,
+                                std::vector<py::ssize_t> const &,
+                                std::vector<py::ssize_t> &,
+                                std::vector<py::ssize_t> &,
+                                py::ssize_t &);
+
+void simplify_iteration_space(int &,
+                              const py::ssize_t *const &,
+                              std::vector<py::ssize_t> const &,
+                              std::vector<py::ssize_t> const &,
+                              std::vector<py::ssize_t> &,
+                              std::vector<py::ssize_t> &,
+                              std::vector<py::ssize_t> &,
+                              py::ssize_t &,
+                              py::ssize_t &);
+
+void simplify_iteration_space_3(int &,
+                                const py::ssize_t *const &,
+                                // src1
+                                std::vector<py::ssize_t> const &,
+                                // src2
+                                std::vector<py::ssize_t> const &,
+                                // dst
+                                std::vector<py::ssize_t> const &,
+                                // output
+                                std::vector<py::ssize_t> &,
+                                std::vector<py::ssize_t> &,
+                                std::vector<py::ssize_t> &,
+                                std::vector<py::ssize_t> &,
+                                py::ssize_t &,
+                                py::ssize_t &,
+                                py::ssize_t &);
+
+void simplify_iteration_space_4(int &,
+                                const py::ssize_t *const &,
+                                // src1
+                                std::vector<py::ssize_t> const &,
+                                // src2
+                                std::vector<py::ssize_t> const &,
+                                // src3
+                                std::vector<py::ssize_t> const &,
+                                // dst
+                                std::vector<py::ssize_t> const &,
+                                // output
+                                std::vector<py::ssize_t> &,
+                                std::vector<py::ssize_t> &,
+                                std::vector<py::ssize_t> &,
+                                std::vector<py::ssize_t> &,
+                                std::vector<py::ssize_t> &,
+                                py::ssize_t &,
+                                py::ssize_t &,
+                                py::ssize_t &,
+                                py::ssize_t &);
+
+void compact_iteration_space(int &,
+                             const py::ssize_t *const &,
+                             std::vector<py::ssize_t> const &,
+                             // output
+                             std::vector<py::ssize_t> &,
+                             std::vector<py::ssize_t> &);
+
+void split_iteration_space(const std::vector<py::ssize_t> &,
+                           const std::vector<py::ssize_t> &,
+                           int,
+                           int,
+                           // output
+                           std::vector<py::ssize_t> &,
+                           std::vector<py::ssize_t> &,
+                           std::vector<py::ssize_t> &,
+                           std::vector<py::ssize_t> &);
+
+py::ssize_t _ravel_multi_index_c(std::vector<py::ssize_t> const &,
+                                 std::vector<py::ssize_t> const &);
+py::ssize_t _ravel_multi_index_f(std::vector<py::ssize_t> const &,
+                                 std::vector<py::ssize_t> const &);
+std::vector<py::ssize_t> _unravel_index_c(py::ssize_t,
+                                          std::vector<py::ssize_t> const &);
+std::vector<py::ssize_t> _unravel_index_f(py::ssize_t,
+                                          std::vector<py::ssize_t> const &);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
new file mode 100644
index 000000000000..54d6adbc8f6e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
@@ -0,0 +1,501 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#include <optional>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+
+// #include "accumulators.hpp"
+// #include "boolean_advanced_indexing.hpp"
+// #include "clip.hpp"
+#include "copy_and_cast_usm_to_usm.hpp"
+#include "copy_as_contig.hpp"
+// #include "copy_for_reshape.hpp"
+// #include "copy_for_roll.hpp"
+// #include "copy_numpy_ndarray_into_usm_ndarray.hpp"
+#include "device_support_queries.hpp"
+// #include "eye_ctor.hpp"
+// #include "full_ctor.hpp"
+// #include "integer_advanced_indexing.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
+// #include "linear_sequences.hpp"
+// #include "repeat.hpp"
+#include "simplify_iteration_space.hpp"
+// #include "triul_ctor.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/strided_iters.hpp"
+// #include "where.hpp"
+// #include "zeros_ctor.hpp"
+
+namespace py = pybind11;
+
+static_assert(std::is_same_v<py::ssize_t, dpctl::tensor::ssize_t>);
+
+namespace
+{
+
+using dpctl::tensor::c_contiguous_strides;
+using dpctl::tensor::f_contiguous_strides;
+
+using dpctl::tensor::overlap::MemoryOverlap;
+using dpctl::tensor::overlap::SameLogicalTensors;
+
+using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray;
+using dpctl::tensor::py_internal::py_as_c_contig;
+using dpctl::tensor::py_internal::py_as_f_contig;
+
+/* =========================== Copy for reshape ============================= */
+
+// using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape;
+
+/* =========================== Copy for roll ============================= */
+
+// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d;
+// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd;
+
+/* ============= Copy from numpy.ndarray to usm_ndarray ==================== */
+
+// using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray;
+
+/* ============= linear-sequence ==================== */
+
+// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine;
+// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step;
+
+/* ================ Full ================== */
+
+// using dpctl::tensor::py_internal::usm_ndarray_full;
+
+/* ================ Zeros ================== */
+
+// using dpctl::tensor::py_internal::usm_ndarray_zeros;
+
+/* ============== Advanced Indexing ============= */
+// using dpctl::tensor::py_internal::usm_ndarray_put;
+// using dpctl::tensor::py_internal::usm_ndarray_take;
+
+// using dpctl::tensor::py_internal::py_extract;
+// using dpctl::tensor::py_internal::py_mask_positions;
+// using dpctl::tensor::py_internal::py_nonzero;
+// using dpctl::tensor::py_internal::py_place;
+
+/* ================= Repeat ====================*/
+// using dpctl::tensor::py_internal::py_cumsum_1d;
+// using dpctl::tensor::py_internal::py_repeat_by_scalar;
+// using dpctl::tensor::py_internal::py_repeat_by_sequence;
+
+/* ================ Eye ================== */
+
+// using dpctl::tensor::py_internal::usm_ndarray_eye;
+
+/* =========================== Tril and triu ============================== */
+
+// using dpctl::tensor::py_internal::usm_ndarray_triul;
+
+/* =========================== Where ============================== */
+
+// using dpctl::tensor::py_internal::py_where;
+
+/* =========================== Clip ============================== */
+// using dpctl::tensor::py_internal::py_clip;
+
+// populate dispatch tables
+void init_dispatch_tables(void)
+{
+    using namespace dpctl::tensor::py_internal;
+
+    init_copy_and_cast_usm_to_usm_dispatch_tables();
+    // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables();
+    // init_advanced_indexing_dispatch_tables();
+    // init_where_dispatch_tables();
+    return;
+}
+
+// populate dispatch vectors
+void init_dispatch_vectors(void)
+{
+    using namespace dpctl::tensor::py_internal;
+
+    init_copy_as_contig_dispatch_vectors();
+    // init_copy_for_reshape_dispatch_vectors();
+    // init_copy_for_roll_dispatch_vectors();
+    // init_linear_sequences_dispatch_vectors();
+    // init_full_ctor_dispatch_vectors();
+    // init_zeros_ctor_dispatch_vectors();
+    // init_eye_ctor_dispatch_vectors();
+    // init_triul_ctor_dispatch_vectors();
+
+    // populate_masked_extract_dispatch_vectors();
+    // populate_masked_place_dispatch_vectors();
+
+    // populate_mask_positions_dispatch_vectors();
+
+    // populate_cumsum_1d_dispatch_vectors();
+    // init_repeat_dispatch_vectors();
+
+    // init_clip_dispatch_vectors();
+
+    return;
+}
+
+} // namespace
+
+PYBIND11_MODULE(_tensor_impl, m)
+{
+    init_dispatch_tables();
+    init_dispatch_vectors();
+
+    using dpctl::tensor::strides::contract_iter;
+    m.def(
+        "_contract_iter", &contract_iter<py::ssize_t, py::value_error>,
+        "Simplifies iteration of array of given shape & stride. Returns "
+        "a triple: shape, stride and offset for the new iterator of possible "
+        "smaller dimension, which traverses the same elements as the original "
+        "iterator, possibly in a different order.");
+
+    m.def("_copy_usm_ndarray_into_usm_ndarray",
+          &copy_usm_ndarray_into_usm_ndarray,
+          "Copies from usm_ndarray `src` into usm_ndarray `dst` of the same "
+          "shape. "
+          "Returns a tuple of events: (host_task_event, compute_task_event)",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_as_c_contig", &py_as_c_contig,
+          "Copies from usm_ndarray `src` into C-contiguous usm_ndarray "
+          "`dst` of the same shape and the same data type. "
+          "Returns a tuple of events: (host_task_event, compute_task_event)",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_as_f_contig", &py_as_f_contig,
+          "Copies from usm_ndarray `src` into F-contiguous usm_ndarray "
+          "`dst` of the same shape and the same data type. "
+          "Returns a tuple of events: (host_task_event, compute_task_event)",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    using dpctl::tensor::strides::contract_iter2;
+    m.def(
+        "_contract_iter2", &contract_iter2<py::ssize_t, py::value_error>,
+        "Simplifies iteration over elements of pair of arrays of given shape "
+        "with strides stride1 and stride2. Returns "
+        "a 5-tuple: shape, stride and offset for the new iterator of possible "
+        "smaller dimension for each array, which traverses the same elements "
+        "as the original "
+        "iterator, possibly in a different order.");
+
+    using dpctl::tensor::strides::contract_iter3;
+    m.def(
+        "_contract_iter3", &contract_iter3<py::ssize_t, py::value_error>,
+        "Simplifies iteration over elements of 3-tuple of arrays of given "
+        "shape "
+        "with strides stride1, stride2, and stride3. Returns "
+        "a 7-tuple: shape, stride and offset for the new iterator of possible "
+        "smaller dimension for each array, which traverses the same elements "
+        "as the original "
+        "iterator, possibly in a different order.");
+
+    using dpctl::tensor::strides::contract_iter4;
+    m.def(
+        "_contract_iter4", &contract_iter4<py::ssize_t, py::value_error>,
+        "Simplifies iteration over elements of 4-tuple of arrays of given "
+        "shape "
+        "with strides stride1, stride2, stride3, and stride4. Returns "
+        "a 9-tuple: shape, stride and offset for the new iterator of possible "
+        "smaller dimension for each array, which traverses the same elements "
+        "as the original "
+        "iterator, possibly in a different order.");
+
+    static constexpr char orderC = 'C';
+    m.def(
+        "_ravel_multi_index",
+        [](const std::vector<py::ssize_t> &mi,
+           const std::vector<py::ssize_t> &shape, char order = 'C') {
+            if (order == orderC) {
+                return dpctl::tensor::py_internal::_ravel_multi_index_c(mi,
+                                                                        shape);
+            }
+            else {
+                return dpctl::tensor::py_internal::_ravel_multi_index_f(mi,
+                                                                        shape);
+            }
+        },
+        "");
+
+    m.def(
+        "_unravel_index",
+        [](py::ssize_t flat_index, const std::vector<py::ssize_t> &shape,
+           char order = 'C') {
+            if (order == orderC) {
+                return dpctl::tensor::py_internal::_unravel_index_c(flat_index,
+                                                                    shape);
+            }
+            else {
+                return dpctl::tensor::py_internal::_unravel_index_f(flat_index,
+                                                                    shape);
+            }
+        },
+        "");
+
+    // m.def("_copy_usm_ndarray_for_reshape", &copy_usm_ndarray_for_reshape,
+    //       "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same
+    //       " "number of elements using underlying 'C'-contiguous order for
+    //       flat " "traversal. " "Returns a tuple of events: (ht_event,
+    //       comp_event)", py::arg("src"), py::arg("dst"),
+    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    // m.def("_copy_usm_ndarray_for_roll_1d", &copy_usm_ndarray_for_roll_1d,
+    //       "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same
+    //       " "shapes using underlying 'C'-contiguous order for flat "
+    //       "traversal with shift. "
+    //       "Returns a tuple of events: (ht_event, comp_event)",
+    //       py::arg("src"), py::arg("dst"), py::arg("shift"),
+    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    // m.def("_copy_usm_ndarray_for_roll_nd", &copy_usm_ndarray_for_roll_nd,
+    //       "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same
+    //       " "shapes using underlying 'C'-contiguous order for " "traversal
+    //       with shifts along each axis. " "Returns a tuple of events:
+    //       (ht_event, comp_event)", py::arg("src"), py::arg("dst"),
+    //       py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") =
+    //       py::list());
+
+    // m.def("_linspace_step", &usm_ndarray_linear_sequence_step,
+    //       "Fills input 1D contiguous usm_ndarray `dst` with linear sequence "
+    //       "specified by "
+    //       "starting point `start` and step `dt`. "
+    //       "Returns a tuple of events: (ht_event, comp_event)",
+    //       py::arg("start"), py::arg("dt"), py::arg("dst"),
+    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine,
+    //       "Fills input 1D contiguous usm_ndarray `dst` with linear sequence "
+    //       "specified by "
+    //       "starting point `start` and end point `end`. "
+    //       "Returns a tuple of events: (ht_event, comp_event)",
+    //       py::arg("start"), py::arg("end"), py::arg("dst"),
+    //       py::arg("include_endpoint"), py::arg("sycl_queue"),
+    //       py::arg("depends") = py::list());
+
+    // m.def("_copy_numpy_ndarray_into_usm_ndarray",
+    //       &copy_numpy_ndarray_into_usm_ndarray,
+    //       "Copy from numpy array `src` into usm_ndarray `dst`
+    //       synchronously.", py::arg("src"), py::arg("dst"),
+    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    // m.def("_zeros_usm_ndarray", &usm_ndarray_zeros,
+    //       "Populate usm_ndarray `dst` with zeros.", py::arg("dst"),
+    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    // m.def("_full_usm_ndarray", &usm_ndarray_full,
+    //       "Populate usm_ndarray `dst` with given fill_value.",
+    //       py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"),
+    //       py::arg("depends") = py::list());
+
+    // m.def("_take", &usm_ndarray_take,
+    //       "Takes elements at usm_ndarray indices `ind` and axes starting "
+    //       "at axis `axis_start` from array `src` and copies them "
+    //       "into usm_ndarray `dst` synchronously."
+    //       "Returns a tuple of events: (hev, ev)",
+    //       py::arg("src"), py::arg("ind"), py::arg("dst"),
+    //       py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"),
+    //   py::arg("depends") = py::list());
+
+    // m.def("_put", &usm_ndarray_put,
+    //       "Puts elements at usm_ndarray indices `ind` and axes starting "
+    //       "at axis `axis_start` into array `dst` from "
+    //       "usm_ndarray `val` synchronously."
+    //       "Returns a tuple of events: (hev, ev)",
+    //       py::arg("dst"), py::arg("ind"), py::arg("val"),
+    //       py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"),
+    //       py::arg("depends") = py::list());
+
+    // m.def("_eye", &usm_ndarray_eye,
+    //       "Fills input 2D contiguous usm_ndarray `dst` with "
+    //       "zeros outside of the diagonal "
+    //       "specified by "
+    //       "the diagonal index `k` "
+    //       "which is filled with ones."
+    //       "Returns a tuple of events: (ht_event, comp_event)",
+    //       py::arg("k"), py::arg("dst"), py::arg("sycl_queue"),
+    //       py::arg("depends") = py::list());
+
+    m.def("default_device_fp_type",
+          dpctl::tensor::py_internal::default_device_fp_type,
+          "Gives default floating point type supported by device.",
+          py::arg("dev"));
+
+    m.def("default_device_int_type",
+          dpctl::tensor::py_internal::default_device_int_type,
+          "Gives default signed integer type supported by device.",
+          py::arg("dev"));
+
+    m.def("default_device_uint_type",
+          dpctl::tensor::py_internal::default_device_uint_type,
+          "Gives default unsigned integer type supported by device.",
+          py::arg("dev"));
+
+    m.def("default_device_bool_type",
+          dpctl::tensor::py_internal::default_device_bool_type,
+          "Gives default boolean type supported by device.", py::arg("dev"));
+
+    m.def("default_device_complex_type",
+          dpctl::tensor::py_internal::default_device_complex_type,
+          "Gives default complex floating point type supported by device.",
+          py::arg("dev"));
+
+    m.def("default_device_index_type",
+          dpctl::tensor::py_internal::default_device_index_type,
+          "Gives default index type supported by device.", py::arg("dev"));
+
+    // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src,
+    //                   const dpctl::tensor::usm_ndarray &dst, py::ssize_t k,
+    //                   sycl::queue &exec_q,
+    //                   const std::vector<sycl::event> depends)
+    //     -> std::pair<sycl::event, sycl::event> {
+    //     return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends);
+    // };
+    // m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"),
+    //       py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
+    //       py::arg("depends") = py::list());
+
+    // auto triu_fn = [](const dpctl::tensor::usm_ndarray &src,
+    //                   const dpctl::tensor::usm_ndarray &dst, py::ssize_t k,
+    //                   sycl::queue &exec_q,
+    //                   const std::vector<sycl::event> depends)
+    //     -> std::pair<sycl::event, sycl::event> {
+    //     return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends);
+    // };
+    // m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"),
+    //       py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
+    //       py::arg("depends") = py::list());
+
+    // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"),
+    //       py::arg("cumsum"), py::arg("sycl_queue"),
+    //       py::arg("depends") = py::list());
+
+    // m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"),
+    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    // m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"),
+    //       py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"),
+    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto overlap = [](const dpctl::tensor::usm_ndarray &x1,
+                      const dpctl::tensor::usm_ndarray &x2) -> bool {
+        auto const &overlap = MemoryOverlap();
+        return overlap(x1, x2);
+    };
+    m.def("_array_overlap", overlap,
+          "Determines if the memory regions indexed by each array overlap",
+          py::arg("array1"), py::arg("array2"));
+
+    auto same_logical_tensors =
+        [](const dpctl::tensor::usm_ndarray &x1,
+           const dpctl::tensor::usm_ndarray &x2) -> bool {
+        auto const &same_logical_tensors = SameLogicalTensors();
+        return same_logical_tensors(x1, x2);
+    };
+    m.def("_same_logical_tensors", same_logical_tensors,
+          "Determines if the memory regions indexed by each array are the same",
+          py::arg("array1"), py::arg("array2"));
+
+    // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"),
+    //       py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"),
+    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    // m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"),
+    //       py::arg("mask_shape"), py::arg("sycl_queue"),
+    //       py::arg("depends") = py::list());
+
+    // m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"),
+    //       py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"),
+    //       py::arg("depends") = py::list());
+
+    // auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src,
+    //                           const dpctl::tensor::usm_ndarray &dst,
+    //                           const dpctl::tensor::usm_ndarray &reps,
+    //                           const dpctl::tensor::usm_ndarray &cumsum,
+    //                           std::optional<int> axis, sycl::queue &exec_q,
+    //                           const std::vector<sycl::event> depends)
+    //     -> std::pair<sycl::event, sycl::event> {
+    //     if (axis) {
+    //         return py_repeat_by_sequence(src, dst, reps, cumsum,
+    //         axis.value(),
+    //                                      exec_q, depends);
+    //     }
+    //     else {
+    //         return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q,
+    //                                      depends);
+    //     }
+    // };
+    // m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"),
+    //       py::arg("dst"), py::arg("reps"), py::arg("cumsum"),
+    //       py::arg("axis"), py::arg("sycl_queue"), py::arg("depends") =
+    //       py::list());
+
+    // auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src,
+    //                         const dpctl::tensor::usm_ndarray &dst,
+    //                         const py::ssize_t reps, std::optional<int> axis,
+    //                         sycl::queue &exec_q,
+    //                         const std::vector<sycl::event> depends)
+    //     -> std::pair<sycl::event, sycl::event> {
+    //     if (axis) {
+    //         return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q,
+    //                                    depends);
+    //     }
+    //     else {
+    //         return py_repeat_by_scalar(src, dst, reps, exec_q, depends);
+    //     }
+    // };
+    // m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"),
+    //       py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"),
+    //       py::arg("depends") = py::list());
+
+    // m.def("_clip", &py_clip,
+    //       "Clamps elements of array `x` to the range "
+    //       "[`min`, `max] and writes the result to the "
+    //       "array `dst` for each element of `x`, `min`, and `max`."
+    //       "Returns a tuple of events: (hev, ev)",
+    //       py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"),
+    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+}
diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt
index 86457cd73ea6..69a99b996d97 100644
--- a/dpnp/backend/extensions/blas/CMakeLists.txt
+++ b/dpnp/backend/extensions/blas/CMakeLists.txt
@@ -68,7 +68,7 @@ target_include_directories(
     PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt
index 59615ea7a6d5..8a96d8cbd25a 100644
--- a/dpnp/backend/extensions/fft/CMakeLists.txt
+++ b/dpnp/backend/extensions/fft/CMakeLists.txt
@@ -61,7 +61,7 @@ target_include_directories(
     ${python_module_name}
     PRIVATE
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt
index b6c644ceb0f6..373c6152f662 100644
--- a/dpnp/backend/extensions/indexing/CMakeLists.txt
+++ b/dpnp/backend/extensions/indexing/CMakeLists.txt
@@ -65,7 +65,7 @@ target_include_directories(
     PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt
index e35621735dbd..2bac0932a673 100644
--- a/dpnp/backend/extensions/lapack/CMakeLists.txt
+++ b/dpnp/backend/extensions/lapack/CMakeLists.txt
@@ -91,7 +91,7 @@ target_include_directories(
         ${CMAKE_CURRENT_SOURCE_DIR}/../../
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index fdafbb2c4a92..60d26295acf8 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -70,7 +70,7 @@ target_include_directories(
     PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt
index eb9b16edcc63..45d2706fb48d 100644
--- a/dpnp/backend/extensions/ufunc/CMakeLists.txt
+++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt
@@ -88,7 +88,7 @@ target_include_directories(
         ${CMAKE_CURRENT_SOURCE_DIR}/../../
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt
index 3e9b40344444..32f7d4281c2f 100644
--- a/dpnp/backend/extensions/vm/CMakeLists.txt
+++ b/dpnp/backend/extensions/vm/CMakeLists.txt
@@ -110,7 +110,7 @@ target_include_directories(
     PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt
index 3877c958e76c..5b7921ad324c 100644
--- a/dpnp/backend/extensions/window/CMakeLists.txt
+++ b/dpnp/backend/extensions/window/CMakeLists.txt
@@ -66,7 +66,7 @@ target_include_directories(
         ${CMAKE_CURRENT_SOURCE_DIR}/../../
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
index 57bf50422fa0..88abcee5035c 100644
--- a/dpnp/dpnp_algo/dpnp_elementwise_common.py
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -31,7 +31,6 @@
 
 import dpctl.tensor as dpt
 import dpctl.tensor._copy_utils as dtc
-import dpctl.tensor._tensor_impl as dti
 import dpctl.tensor._type_utils as dtu
 import dpctl.utils as dpu
 import numpy
@@ -45,6 +44,10 @@
     _validate_dtype,
 )
 
+# pylint: disable=no-name-in-module
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._tensor_impl as dti
 import dpnp
 import dpnp.backend.extensions.vm._vm_impl as vmi
 from dpnp.dpnp_array import dpnp_array
diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index fba1a215756a..50b474014666 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -40,21 +40,22 @@
 """
 
 # pylint: disable=protected-access
+# pylint: disable=no-name-in-module
 
 import os
 
 import dpctl
 import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dpu
 import numpy
 from dpctl.tensor._device import normalize_queue_device
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 
 from .dpnp_array import dpnp_array
-
-# pylint: disable=no-name-in-module
 from .dpnp_utils import (
     dpnp_descriptor,
     map_dtype_to_device,
diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py
index 6eefe010b699..16ab633d506b 100644
--- a/dpnp/dpnp_iface_searching.py
+++ b/dpnp/dpnp_iface_searching.py
@@ -40,8 +40,11 @@
 """
 
 import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as dti
 
+# pylint: disable=no-name-in-module
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._tensor_impl as dti
 import dpnp
 
 from .dpnp_array import dpnp_array
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index 191b8aa65d13..9ad97742ee18 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -28,7 +28,6 @@
 
 import dpctl
 import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dpu
 import numpy
 from dpctl.tensor._numpy_helper import (
@@ -38,6 +37,10 @@
 )
 from dpctl.utils import ExecutionPlacementError
 
+# pylint: disable=no-name-in-module
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 import dpnp.backend.extensions.blas._blas_impl as bi
 from dpnp.dpnp_array import dpnp_array
diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py
index f00db6fdfb92..9e5ae405ccc5 100644
--- a/dpnp/scipy/linalg/_utils.py
+++ b/dpnp/scipy/linalg/_utils.py
@@ -43,9 +43,11 @@
 
 from warnings import warn
 
-import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dpu
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 import dpnp.backend.extensions.lapack._lapack_impl as li
 from dpnp.dpnp_utils import get_usm_allocations
diff --git a/dpnp/tests/test_array_api_info.py b/dpnp/tests/test_array_api_info.py
index b310192ffc59..32730c8724dc 100644
--- a/dpnp/tests/test_array_api_info.py
+++ b/dpnp/tests/test_array_api_info.py
@@ -1,9 +1,11 @@
-import numpy
 import pytest
 from dpctl import SyclDeviceCreationError, get_devices, select_default_device
-from dpctl.tensor._tensor_impl import default_device_complex_type
 
 import dpnp
+
+# TODO: revert to `from dpctl.tensor....`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._tensor_impl import default_device_complex_type
 from dpnp.tests.helper import (
     has_support_aspect64,
     is_win_platform,
diff --git a/pyproject.toml b/pyproject.toml
index cdf592535d11..67fb75cb5f54 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314']
 [tool.codespell]
 builtin = "clear,rare,informal,names"
 check-filenames = true
-ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT"
+ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT"
 quiet-level = 3
 
 [tool.coverage.report]
diff --git a/setup.py b/setup.py
index cc21221299c4..7ffef3bed9d8 100644
--- a/setup.py
+++ b/setup.py
@@ -44,6 +44,9 @@
         "dpnp.scipy",
         "dpnp.scipy.linalg",
         "dpnp.scipy.special",
+        # TODO: replace with dpctl; dpctl.tensor
+        "dpctl_ext",
+        "dpctl_ext.tensor",
     ],
     package_data={
         "dpnp": [

From b00d0641a673a8a76e51b3641357d71e2ffb74b5 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 25 Feb 2026 00:27:47 +0100
Subject: [PATCH 03/43] Extend `._tensor_impl` with remaining functions used by
 dpnp (#2758)

This PR extends `_tensor_impl` in `dpctl_ext.tensor` with the remaining
functions that are explicitly used in `dpnp` implementations (`_take`,
`_full_usm_ndarray`, `_zeros_usm_ndarray`, `_triu`) enabling a complete
switch to `dpctl_ext.tensor._tensor_impl` instead of
`dpctl.tensor._tensor_impl`

It also adds `take()`, `put()`, `full()`,`tril()` and `triu()` to
`dpctl_ext.tensor` and updates the corresponding dpnp functions to use
these implementations internally
---
 dpctl_ext/tensor/CMakeLists.txt               |   8 +-
 dpctl_ext/tensor/__init__.py                  |  19 +
 dpctl_ext/tensor/_ctors.py                    | 326 +++++++
 dpctl_ext/tensor/_indexing_functions.py       | 329 +++++++
 dpctl_ext/tensor/_numpy_helper.py             |  45 +
 .../include/kernels/constructors.hpp          | 303 +++++++
 .../kernels/integer_advanced_indexing.hpp     | 418 +++++++++
 .../tensor/libtensor/source/full_ctor.cpp     | 311 +++++++
 .../tensor/libtensor/source/full_ctor.hpp     |  57 ++
 .../source/integer_advanced_indexing.cpp      | 817 ++++++++++++++++++
 .../source/integer_advanced_indexing.hpp      |  71 ++
 .../tensor/libtensor/source/tensor_ctors.cpp  | 144 ++-
 .../tensor/libtensor/source/triul_ctor.cpp    | 246 ++++++
 .../tensor/libtensor/source/triul_ctor.hpp    |  58 ++
 .../tensor/libtensor/source/zeros_ctor.cpp    | 161 ++++
 .../tensor/libtensor/source/zeros_ctor.hpp    |  54 ++
 dpnp/dpnp_algo/dpnp_fill.py                   |   9 +-
 dpnp/dpnp_container.py                        |   7 +-
 dpnp/dpnp_iface.py                            |   1 +
 dpnp/dpnp_iface_indexing.py                   |  11 +-
 dpnp/fft/dpnp_utils_fft.py                    |  14 +-
 dpnp/linalg/dpnp_utils_linalg.py              |   5 +-
 dpnp/scipy/linalg/_utils.py                   |   1 +
 23 files changed, 3325 insertions(+), 90 deletions(-)
 create mode 100644 dpctl_ext/tensor/_ctors.py
 create mode 100644 dpctl_ext/tensor/_indexing_functions.py
 create mode 100644 dpctl_ext/tensor/_numpy_helper.py
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/full_ctor.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/full_ctor.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/triul_ctor.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/triul_ctor.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index ed69b4f10cba..fd781a9f9586 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -52,12 +52,12 @@ set(_tensor_impl_sources
     # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp
     # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp
     # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp
     # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp
     # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp
     # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp
     # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index a71324cb88d8..3c6939eff7a0 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -25,3 +25,22 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
+
+
+from dpctl_ext.tensor._ctors import (
+    full,
+    tril,
+    triu,
+)
+from dpctl_ext.tensor._indexing_functions import (
+    put,
+    take,
+)
+
+__all__ = [
+    "full",
+    "put",
+    "take",
+    "tril",
+    "triu",
+]
diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py
new file mode 100644
index 000000000000..a0e7b28e66ff
--- /dev/null
+++ b/dpctl_ext/tensor/_ctors.py
@@ -0,0 +1,326 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import operator
+from numbers import Number
+
+import dpctl
+import dpctl.tensor as dpt
+import dpctl.utils
+import numpy as np
+from dpctl.tensor._data_types import _get_dtype
+from dpctl.tensor._device import normalize_queue_device
+
+import dpctl_ext.tensor._tensor_impl as ti
+
+
+def _cast_fill_val(fill_val, dt):
+    """
+    Casts the Python scalar `fill_val` to another Python type coercible to the
+    requested data type `dt`, if necessary.
+    """
+    val_type = type(fill_val)
+    if val_type in [float, complex] and np.issubdtype(dt, np.integer):
+        return int(fill_val.real)
+    elif val_type is complex and np.issubdtype(dt, np.floating):
+        return fill_val.real
+    elif val_type is int and np.issubdtype(dt, np.integer):
+        return _to_scalar(fill_val, dt)
+    else:
+        return fill_val
+
+
+def _to_scalar(obj, sc_ty):
+    """A way to convert object to NumPy scalar type.
+    Raises OverflowError if obj can not be represented
+    using the requested scalar type.
+    """
+    zd_arr = np.asarray(obj, dtype=sc_ty)
+    return zd_arr[()]
+
+
+def _validate_fill_value(fill_val):
+    """Validates that `fill_val` is a numeric or boolean scalar."""
+    # TODO: verify if `np.True_` and `np.False_` should be instances of
+    # Number in NumPy, like other NumPy scalars and like Python bools
+    # check for `np.bool_` separately as NumPy<2 has no `np.bool`
+    if not isinstance(fill_val, Number) and not isinstance(fill_val, np.bool_):
+        raise TypeError(
+            f"array cannot be filled with scalar of type {type(fill_val)}"
+        )
+
+
+def full(
+    shape,
+    fill_value,
+    *,
+    dtype=None,
+    order="C",
+    device=None,
+    usm_type=None,
+    sycl_queue=None,
+):
+    """
+    Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified
+    shape and filled with `fill_value`.
+
+    Args:
+        shape (tuple):
+            Dimensions of the array to be created.
+        fill_value (int,float,complex,usm_ndarray):
+            fill value
+        dtype (optional): data type of the array. Can be typestring,
+            a :class:`numpy.dtype` object, :mod:`numpy` char string,
+            or a NumPy scalar type. Default: ``None``
+        order ("C", or "F"):
+            memory layout for the array. Default: ``"C"``
+        device (optional): array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            New array initialized with given value.
+    """
+    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'F' or 'C'."
+        )
+    order = order[0].upper()
+    dpctl.utils.validate_usm_type(usm_type, allow_none=True)
+
+    if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)):
+        if (
+            isinstance(fill_value, dpt.usm_ndarray)
+            and sycl_queue is None
+            and device is None
+        ):
+            sycl_queue = fill_value.sycl_queue
+        else:
+            sycl_queue = normalize_queue_device(
+                sycl_queue=sycl_queue, device=device
+            )
+        X = dpt.asarray(
+            fill_value,
+            dtype=dtype,
+            order=order,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+        )
+        return dpt.copy(dpt.broadcast_to(X, shape), order=order)
+    else:
+        _validate_fill_value(fill_value)
+
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    usm_type = usm_type if usm_type is not None else "device"
+    dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value))
+    res = dpt.usm_ndarray(
+        shape,
+        dtype=dtype,
+        buffer=usm_type,
+        order=order,
+        buffer_ctor_kwargs={"queue": sycl_queue},
+    )
+    fill_value = _cast_fill_val(fill_value, dtype)
+
+    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    # populating new allocation, no dependent events
+    hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue)
+    _manager.add_event_pair(hev, full_ev)
+    return res
+
+
+def tril(x, /, *, k=0):
+    """
+    Returns the lower triangular part of a matrix (or a stack of matrices)
+    ``x``.
+
+    The lower triangular part of the matrix is defined as the elements on and
+    below the specified diagonal ``k``.
+
+    Args:
+        x (usm_ndarray):
+            Input array
+        k (int, optional):
+            Specifies the diagonal above which to set
+            elements to zero. If ``k = 0``, the diagonal is the main diagonal.
+            If ``k < 0``, the diagonal is below the main diagonal.
+            If ``k > 0``, the diagonal is above the main diagonal.
+            Default: ``0``
+
+    Returns:
+        usm_ndarray:
+            A lower-triangular array or a stack of lower-triangular arrays.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected argument of type dpctl.tensor.usm_ndarray, "
+            f"got {type(x)}."
+        )
+
+    k = operator.index(k)
+
+    order = "F" if (x.flags.f_contiguous) else "C"
+
+    shape = x.shape
+    nd = x.ndim
+    if nd < 2:
+        raise ValueError("Array dimensions less than 2.")
+
+    q = x.sycl_queue
+    if k >= shape[nd - 1] - 1:
+        res = dpt.empty(
+            x.shape,
+            dtype=x.dtype,
+            order=order,
+            usm_type=x.usm_type,
+            sycl_queue=q,
+        )
+        _manager = dpctl.utils.SequentialOrderManager[q]
+        dep_evs = _manager.submitted_events
+        hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x, dst=res, sycl_queue=q, depends=dep_evs
+        )
+        _manager.add_event_pair(hev, cpy_ev)
+    elif k < -shape[nd - 2]:
+        res = dpt.zeros(
+            x.shape,
+            dtype=x.dtype,
+            order=order,
+            usm_type=x.usm_type,
+            sycl_queue=q,
+        )
+    else:
+        res = dpt.empty(
+            x.shape,
+            dtype=x.dtype,
+            order=order,
+            usm_type=x.usm_type,
+            sycl_queue=q,
+        )
+        _manager = dpctl.utils.SequentialOrderManager[q]
+        dep_evs = _manager.submitted_events
+        hev, tril_ev = ti._tril(
+            src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs
+        )
+        _manager.add_event_pair(hev, tril_ev)
+
+    return res
+
+
+def triu(x, /, *, k=0):
+    """
+    Returns the upper triangular part of a matrix (or a stack of matrices)
+    ``x``.
+
+    The upper triangular part of the matrix is defined as the elements on and
+    above the specified diagonal ``k``.
+
+    Args:
+        x (usm_ndarray):
+            Input array
+        k (int, optional):
+            Specifies the diagonal below which to set
+            elements to zero. If ``k = 0``, the diagonal is the main diagonal.
+            If ``k < 0``, the diagonal is below the main diagonal.
+            If ``k > 0``, the diagonal is above the main diagonal.
+            Default: ``0``
+
+    Returns:
+        usm_ndarray:
+            An upper-triangular array or a stack of upper-triangular arrays.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected argument of type dpctl.tensor.usm_ndarray, "
+            f"got {type(x)}."
+        )
+
+    k = operator.index(k)
+
+    order = "F" if (x.flags.f_contiguous) else "C"
+
+    shape = x.shape
+    nd = x.ndim
+    if nd < 2:
+        raise ValueError("Array dimensions less than 2.")
+
+    q = x.sycl_queue
+    if k > shape[nd - 1]:
+        res = dpt.zeros(
+            x.shape,
+            dtype=x.dtype,
+            order=order,
+            usm_type=x.usm_type,
+            sycl_queue=q,
+        )
+    elif k <= -shape[nd - 2] + 1:
+        res = dpt.empty(
+            x.shape,
+            dtype=x.dtype,
+            order=order,
+            usm_type=x.usm_type,
+            sycl_queue=q,
+        )
+        _manager = dpctl.utils.SequentialOrderManager[q]
+        dep_evs = _manager.submitted_events
+        hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x, dst=res, sycl_queue=q, depends=dep_evs
+        )
+        _manager.add_event_pair(hev, cpy_ev)
+    else:
+        res = dpt.empty(
+            x.shape,
+            dtype=x.dtype,
+            order=order,
+            usm_type=x.usm_type,
+            sycl_queue=q,
+        )
+        _manager = dpctl.utils.SequentialOrderManager[q]
+        dep_evs = _manager.submitted_events
+        hev, triu_ev = ti._triu(
+            src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs
+        )
+        _manager.add_event_pair(hev, triu_ev)
+
+    return res
diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py
new file mode 100644
index 000000000000..106df09cf97e
--- /dev/null
+++ b/dpctl_ext/tensor/_indexing_functions.py
@@ -0,0 +1,329 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import operator
+
+import dpctl
+import dpctl.tensor as dpt
+import dpctl.utils
+
+import dpctl_ext.tensor._tensor_impl as ti
+
+from ._numpy_helper import normalize_axis_index
+
+
+def _get_indexing_mode(name):
+    modes = {"wrap": 0, "clip": 1}
+    try:
+        return modes[name]
+    except KeyError:
+        raise ValueError(
+            "`mode` must be `wrap` or `clip`." "Got `{}`.".format(name)
+        )
+
+
+def put(x, indices, vals, /, *, axis=None, mode="wrap"):
+    """put(x, indices, vals, axis=None, mode="wrap")
+
+    Puts values into an array along a given axis at given indices.
+
+    Args:
+        x (usm_ndarray):
+            The array the values will be put into.
+        indices (usm_ndarray):
+            One-dimensional array of indices.
+        vals (usm_ndarray):
+            Array of values to be put into ``x``.
+            Must be broadcastable to the result shape
+            ``x.shape[:axis] + indices.shape + x.shape[axis+1:]``.
+        axis (int, optional):
+            The axis along which the values will be placed.
+            If ``x`` is one-dimensional, this argument is optional.
+            Default: ``None``.
+        mode (str, optional):
+            How out-of-bounds indices will be handled. Possible values
+            are:
+
+            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
+              negative indices.
+            - ``"clip"``: clips indices to (``0 <= i < n``).
+
+            Default: ``"wrap"``.
+
+    .. note::
+
+        If input array ``indices`` contains duplicates, a race condition
+        occurs, and the value written into corresponding positions in ``x``
+        may vary from run to run. Preserving sequential semantics in handing
+        the duplicates to achieve deterministic behavior requires additional
+        work, e.g.
+
+        :Example:
+
+            .. code-block:: python
+
+                from dpctl import tensor as dpt
+
+                def put_vec_duplicates(vec, ind, vals):
+                    "Put values into vec, handling possible duplicates in ind"
+                    assert vec.ndim, ind.ndim, vals.ndim == 1, 1, 1
+
+                    # find positions of last occurrences of each
+                    # unique index
+                    ind_flipped = dpt.flip(ind)
+                    ind_uniq = dpt.unique_all(ind_flipped).indices
+                    has_dups = len(ind) != len(ind_uniq)
+
+                    if has_dups:
+                        ind_uniq = dpt.subtract(vec.size - 1, ind_uniq)
+                        ind = dpt.take(ind, ind_uniq)
+                        vals = dpt.take(vals, ind_uniq)
+
+                    dpt.put(vec, ind, vals)
+
+                n = 512
+                ind = dpt.concat((dpt.arange(n), dpt.arange(n, -1, step=-1)))
+                x = dpt.zeros(ind.size, dtype="int32")
+                vals = dpt.arange(ind.size, dtype=x.dtype)
+
+                # Values corresponding to last positions of
+                # duplicate indices are written into the vector x
+                put_vec_duplicates(x, ind, vals)
+
+                parts = (vals[-1:-n-2:-1], dpt.zeros(n, dtype=x.dtype))
+                expected = dpt.concat(parts)
+                assert dpt.all(x == expected)
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x))
+        )
+    if not isinstance(indices, dpt.usm_ndarray):
+        raise TypeError(
+            "`indices` expected `dpt.usm_ndarray`, got `{}`.".format(
+                type(indices)
+            )
+        )
+    if isinstance(vals, dpt.usm_ndarray):
+        queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue]
+        usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type]
+    else:
+        queues_ = [x.sycl_queue, indices.sycl_queue]
+        usm_types_ = [x.usm_type, indices.usm_type]
+    if indices.ndim != 1:
+        raise ValueError(
+            "`indices` expected a 1D array, got `{}`".format(indices.ndim)
+        )
+    if indices.dtype.kind not in "ui":
+        raise IndexError(
+            "`indices` expected integer data type, got `{}`".format(
+                indices.dtype
+            )
+        )
+    exec_q = dpctl.utils.get_execution_queue(queues_)
+    if exec_q is None:
+        raise dpctl.utils.ExecutionPlacementError
+    vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_)
+
+    mode = _get_indexing_mode(mode)
+
+    x_ndim = x.ndim
+    if axis is None:
+        if x_ndim > 1:
+            raise ValueError(
+                "`axis` cannot be `None` for array of dimension `{}`".format(
+                    x_ndim
+                )
+            )
+        axis = 0
+
+    if x_ndim > 0:
+        axis = normalize_axis_index(operator.index(axis), x_ndim)
+        x_sh = x.shape
+        if x_sh[axis] == 0 and indices.size != 0:
+            raise IndexError("cannot take non-empty indices from an empty axis")
+        val_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :]
+    else:
+        if axis != 0:
+            raise ValueError("`axis` must be 0 for an array of dimension 0.")
+        val_shape = indices.shape
+
+    if not isinstance(vals, dpt.usm_ndarray):
+        vals = dpt.asarray(
+            vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q
+        )
+    # choose to throw here for consistency with `place`
+    if vals.size == 0:
+        raise ValueError(
+            "cannot put into non-empty indices along an empty axis"
+        )
+    if vals.dtype == x.dtype:
+        rhs = vals
+    else:
+        rhs = dpt.astype(vals, x.dtype)
+    rhs = dpt.broadcast_to(rhs, val_shape)
+
+    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    deps_ev = _manager.submitted_events
+    hev, put_ev = ti._put(
+        x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev
+    )
+    _manager.add_event_pair(hev, put_ev)
+
+
+def take(x, indices, /, *, axis=None, out=None, mode="wrap"):
+    """take(x, indices, axis=None, out=None, mode="wrap")
+
+    Takes elements from an array along a given axis at given indices.
+
+    Args:
+        x (usm_ndarray):
+            The array that elements will be taken from.
+        indices (usm_ndarray):
+            One-dimensional array of indices.
+        axis (int, optional):
+            The axis along which the values will be selected.
+            If ``x`` is one-dimensional, this argument is optional.
+            Default: ``None``.
+        out (Optional[usm_ndarray]):
+            Output array to populate. Array must have the correct
+            shape and the expected data type.
+        mode (str, optional):
+            How out-of-bounds indices will be handled. Possible values
+            are:
+
+            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
+              negative indices.
+            - ``"clip"``: clips indices to (``0 <= i < n``).
+
+            Default: ``"wrap"``.
+
+    Returns:
+       usm_ndarray:
+          Array with shape
+          ``x.shape[:axis] + indices.shape + x.shape[axis + 1:]``
+          filled with elements from ``x``.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x))
+        )
+
+    if not isinstance(indices, dpt.usm_ndarray):
+        raise TypeError(
+            "`indices` expected `dpt.usm_ndarray`, got `{}`.".format(
+                type(indices)
+            )
+        )
+    if indices.dtype.kind not in "ui":
+        raise IndexError(
+            "`indices` expected integer data type, got `{}`".format(
+                indices.dtype
+            )
+        )
+    if indices.ndim != 1:
+        raise ValueError(
+            "`indices` expected a 1D array, got `{}`".format(indices.ndim)
+        )
+    exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, indices.sycl_queue])
+    if exec_q is None:
+        raise dpctl.utils.ExecutionPlacementError
+    res_usm_type = dpctl.utils.get_coerced_usm_type(
+        [x.usm_type, indices.usm_type]
+    )
+
+    mode = _get_indexing_mode(mode)
+
+    x_ndim = x.ndim
+    if axis is None:
+        if x_ndim > 1:
+            raise ValueError(
+                "`axis` cannot be `None` for array of dimension `{}`".format(
+                    x_ndim
+                )
+            )
+        axis = 0
+
+    if x_ndim > 0:
+        axis = normalize_axis_index(operator.index(axis), x_ndim)
+        x_sh = x.shape
+        if x_sh[axis] == 0 and indices.size != 0:
+            raise IndexError("cannot take non-empty indices from an empty axis")
+        res_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :]
+    else:
+        if axis != 0:
+            raise ValueError("`axis` must be 0 for an array of dimension 0.")
+        res_shape = indices.shape
+
+    dt = x.dtype
+
+    orig_out = out
+    if out is not None:
+        if not isinstance(out, dpt.usm_ndarray):
+            raise TypeError(
+                f"output array must be of usm_ndarray type, got {type(out)}"
+            )
+        if not out.flags.writable:
+            raise ValueError("provided `out` array is read-only")
+
+        if out.shape != res_shape:
+            raise ValueError(
+                "The shape of input and output arrays are inconsistent. "
+                f"Expected output shape is {res_shape}, got {out.shape}"
+            )
+        if dt != out.dtype:
+            raise ValueError(
+                f"Output array of type {dt} is needed, got {out.dtype}"
+            )
+        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise dpctl.utils.ExecutionPlacementError(
+                "Input and output allocation queues are not compatible"
+            )
+        if ti._array_overlap(x, out):
+            out = dpt.empty_like(out)
+    else:
+        out = dpt.empty(
+            res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q
+        )
+
+    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    deps_ev = _manager.submitted_events
+    hev, take_ev = ti._take(
+        x, (indices,), out, axis, mode, sycl_queue=exec_q, depends=deps_ev
+    )
+    _manager.add_event_pair(hev, take_ev)
+
+    if not (orig_out is None or out is orig_out):
+        # Copy the out data from temporary buffer to original memory
+        ht_e_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=out, dst=orig_out, sycl_queue=exec_q, depends=[take_ev]
+        )
+        _manager.add_event_pair(ht_e_cpy, cpy_ev)
+        out = orig_out
+
+    return out
diff --git a/dpctl_ext/tensor/_numpy_helper.py b/dpctl_ext/tensor/_numpy_helper.py
new file mode 100644
index 000000000000..4ad735823cb3
--- /dev/null
+++ b/dpctl_ext/tensor/_numpy_helper.py
@@ -0,0 +1,45 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+
+import numpy as np
+
+_npver = np.lib.NumpyVersion(np.__version__)
+
+if _npver < "1.25.0":  # pragma: no cover
+    from numpy import AxisError
+else:
+    from numpy.exceptions import AxisError
+
+if _npver >= "2.0.0":
+    from numpy._core.numeric import normalize_axis_index, normalize_axis_tuple
+else:  # pragma: no cover
+    from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple
+
+
+__all__ = ["AxisError", "normalize_axis_index", "normalize_axis_tuple"]
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
new file mode 100644
index 000000000000..22189ee3129c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
@@ -0,0 +1,303 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor constructors.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <array>
+#include <complex>
+#include <cstddef>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/strided_iters.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::constructors
+{
+
+using dpctl::tensor::ssize_t;
+
+/*!
+  @defgroup CtorKernels
+ */
+
+template <typename Ty>
+class full_strided_kernel;
+// template <typename Ty> class eye_kernel;
+
+using namespace dpctl::tensor::offset_utils;
+
+/* ================ Full ================== */
+
+/*!
+ * @brief Function to submit kernel to fill given contiguous memory allocation
+ * with specified value.
+ *
+ * @param exec_q  Sycl queue to which kernel is submitted for execution.
+ * @param nelems  Length of the sequence
+ * @param fill_v  Value to fill the array with
+ * @param dst_p Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename dstTy>
+sycl::event full_contig_impl(sycl::queue &q,
+                             std::size_t nelems,
+                             dstTy fill_v,
+                             char *dst_p,
+                             const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+    sycl::event fill_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        dstTy *p = reinterpret_cast<dstTy *>(dst_p);
+        cgh.fill<dstTy>(p, fill_v, nelems);
+    });
+
+    return fill_ev;
+}
+
+template <typename Ty, typename IndexerT>
+class FullStridedFunctor
+{
+private:
+    Ty *p = nullptr;
+    Ty fill_v;
+    IndexerT indexer;
+
+public:
+    FullStridedFunctor(Ty *p_, const Ty &fill_v_, const IndexerT &indexer_)
+        : p(p_), fill_v(fill_v_), indexer(indexer_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        auto offset = indexer(id.get(0));
+        p[offset] = fill_v;
+    }
+};
+
+/*!
+ * @brief Function to submit kernel to fill given contiguous memory allocation
+ * with specified value.
+ *
+ * @param exec_q  Sycl queue to which kernel is submitted for execution.
+ * @param nd  Array dimensionality
+ * @param nelems  Length of the sequence
+ * @param shape_strides  Kernel accessible USM pointer to packed shape and
+ * strides of array.
+ * @param fill_v  Value to fill the array with
+ * @param dst_p  Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename dstTy>
+sycl::event full_strided_impl(sycl::queue &q,
+                              int nd,
+                              std::size_t nelems,
+                              const ssize_t *shape_strides,
+                              dstTy fill_v,
+                              char *dst_p,
+                              const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+
+    dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
+
+    using dpctl::tensor::offset_utils::StridedIndexer;
+    const StridedIndexer strided_indexer(nd, 0, shape_strides);
+
+    sycl::event fill_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using KernelName = full_strided_kernel<dstTy>;
+        using Impl = FullStridedFunctor<dstTy, StridedIndexer>;
+
+        cgh.parallel_for<KernelName>(sycl::range<1>{nelems},
+                                     Impl(dst_tp, fill_v, strided_indexer));
+    });
+
+    return fill_ev;
+}
+
+/* =========================== Tril and triu ============================== */
+
+// define function type
+typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &,
+                                    ssize_t,   // inner_range  //ssize_t
+                                    ssize_t,   // outer_range
+                                    char *,    // src_data_ptr
+                                    char *,    // dst_data_ptr
+                                    ssize_t,   // nd
+                                    ssize_t *, // shape_and_strides
+                                    ssize_t,   // k
+                                    const std::vector<sycl::event> &,
+                                    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy triangular matrices from source stack to destination
+ * stack.
+ *
+ * @param exec_q  Sycl queue to which kernel is submitted for execution.
+ * @param inner_range  Number of elements in each matrix.
+ * @param outer_range  Number of matrices to copy.
+ * @param src_p  Kernel accessible USM pointer for the source array.
+ * @param dst_p  Kernel accessible USM pointer for the destination array.
+ * @param nd  The array dimensionality of source and destination arrays.
+ * @param shape_and_strides  Kernel accessible USM pointer to packed shape and
+ * strides of arrays.
+ * @param k Position of the diagonal above/below which to copy filling the rest
+ * with zero elements.
+ * @param depends  List of events to wait for before starting computations, if
+ * any.
+ * @param additional_depends  List of additional events to wait for before
+ * starting computations, if any.
+ *
+ * @return  Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename Ty, bool>
+class tri_kernel;
+template <typename Ty, bool upper>
+sycl::event tri_impl(sycl::queue &exec_q,
+                     ssize_t inner_range,
+                     ssize_t outer_range,
+                     char *src_p,
+                     char *dst_p,
+                     ssize_t nd,
+                     ssize_t *shape_and_strides,
+                     ssize_t k,
+                     const std::vector<sycl::event> &depends,
+                     const std::vector<sycl::event> &additional_depends)
+{
+    static constexpr int d2 = 2;
+    ssize_t src_s = nd;
+    ssize_t dst_s = 2 * nd;
+    ssize_t nd_1 = nd - 1;
+    ssize_t nd_2 = nd - 2;
+    Ty *src = reinterpret_cast<Ty *>(src_p);
+    Ty *dst = reinterpret_cast<Ty *>(dst_p);
+
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(exec_q);
+
+    sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.depends_on(additional_depends);
+
+        cgh.parallel_for<tri_kernel<Ty, upper>>(
+            sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) {
+                ssize_t outer_gid = idx[0] / inner_range;
+                ssize_t inner_gid = idx[0] - inner_range * outer_gid;
+
+                ssize_t src_inner_offset = 0, dst_inner_offset = 0;
+                bool to_copy{false};
+
+                {
+                    using dpctl::tensor::strides::CIndexer_array;
+                    CIndexer_array<d2, ssize_t> indexer_i(
+                        {shape_and_strides[nd_2], shape_and_strides[nd_1]});
+                    indexer_i.set(inner_gid);
+                    const std::array<ssize_t, d2> &inner = indexer_i.get();
+                    src_inner_offset =
+                        inner[0] * shape_and_strides[src_s + nd_2] +
+                        inner[1] * shape_and_strides[src_s + nd_1];
+                    dst_inner_offset =
+                        inner[0] * shape_and_strides[dst_s + nd_2] +
+                        inner[1] * shape_and_strides[dst_s + nd_1];
+
+                    if constexpr (upper)
+                        to_copy = (inner[0] + k >= inner[1]);
+                    else
+                        to_copy = (inner[0] + k <= inner[1]);
+                }
+
+                ssize_t src_offset = 0;
+                ssize_t dst_offset = 0;
+                {
+                    using dpctl::tensor::strides::CIndexer_vector;
+                    CIndexer_vector<ssize_t> outer(nd - d2);
+                    outer.get_displacement(
+                        outer_gid, shape_and_strides, shape_and_strides + src_s,
+                        shape_and_strides + dst_s, src_offset, dst_offset);
+                }
+
+                src_offset += src_inner_offset;
+                dst_offset += dst_inner_offset;
+
+                dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0);
+            });
+    });
+    return tri_ev;
+}
+
+/*!
+ * @brief  Factory to get function pointer of type `fnT` for data type `Ty`.
+ * @ingroup CtorKernels
+ */
+template <typename fnT, typename Ty>
+struct TrilGenericFactory
+{
+    fnT get()
+    {
+        fnT f = tri_impl<Ty, /*tril*/ true>;
+        return f;
+    }
+};
+
+/*!
+ * @brief  Factory to get function pointer of type `fnT` for data type `Ty`.
+ * @ingroup CtorKernels
+ */
+template <typename fnT, typename Ty>
+struct TriuGenericFactory
+{
+    fnT get()
+    {
+        fnT f = tri_impl<Ty, /*triu*/ false>;
+        return f;
+    }
+};
+
+} // namespace dpctl::tensor::kernels::constructors
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
new file mode 100644
index 000000000000..7be2b3ea8591
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
@@ -0,0 +1,418 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for advanced tensor index operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "utils/indexing_utils.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::indexing
+{
+
+using dpctl::tensor::ssize_t;
+
+template <typename ProjectorT,
+          typename OrthogIndexer,
+          typename IndicesIndexer,
+          typename AxesIndexer,
+          typename T,
+          typename indT>
+class TakeFunctor
+{
+private:
+    const char *src_ = nullptr;
+    char *dst_ = nullptr;
+    char **ind_ = nullptr;
+    int k_ = 0;
+    std::size_t ind_nelems_ = 0;
+    const ssize_t *axes_shape_and_strides_ = nullptr;
+    OrthogIndexer orthog_strider;
+    IndicesIndexer ind_strider;
+    AxesIndexer axes_strider;
+
+public:
+    TakeFunctor(const char *src_cp,
+                char *dst_cp,
+                char **ind_cp,
+                int k,
+                std::size_t ind_nelems,
+                const ssize_t *axes_shape_and_strides,
+                const OrthogIndexer &orthog_strider_,
+                const IndicesIndexer &ind_strider_,
+                const AxesIndexer &axes_strider_)
+        : src_(src_cp), dst_(dst_cp), ind_(ind_cp), k_(k),
+          ind_nelems_(ind_nelems),
+          axes_shape_and_strides_(axes_shape_and_strides),
+          orthog_strider(orthog_strider_), ind_strider(ind_strider_),
+          axes_strider(axes_strider_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        const T *src = reinterpret_cast<const T *>(src_);
+        T *dst = reinterpret_cast<T *>(dst_);
+
+        ssize_t i_orthog = id / ind_nelems_;
+        ssize_t i_along = id - (i_orthog * ind_nelems_);
+
+        auto orthog_offsets = orthog_strider(i_orthog);
+
+        ssize_t src_offset = orthog_offsets.get_first_offset();
+        ssize_t dst_offset = orthog_offsets.get_second_offset();
+
+        static constexpr ProjectorT proj{};
+        for (int axis_idx = 0; axis_idx < k_; ++axis_idx) {
+            indT *ind_data = reinterpret_cast<indT *>(ind_[axis_idx]);
+
+            ssize_t ind_offset = ind_strider(i_along, axis_idx);
+            // proj produces an index in the range of the given axis
+            ssize_t projected_idx =
+                proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]);
+            src_offset +=
+                projected_idx * axes_shape_and_strides_[k_ + axis_idx];
+        }
+
+        dst_offset += axes_strider(i_along);
+
+        dst[dst_offset] = src[src_offset];
+    }
+};
+
+template <typename ProjectorT,
+          typename OrthogIndexer,
+          typename IndicesIndexer,
+          typename AxesIndexer,
+          typename T,
+          typename indT>
+class take_kernel;
+
+typedef sycl::event (*take_fn_ptr_t)(sycl::queue &,
+                                     std::size_t,
+                                     std::size_t,
+                                     int,
+                                     int,
+                                     int,
+                                     const ssize_t *,
+                                     const ssize_t *,
+                                     const ssize_t *,
+                                     const char *,
+                                     char *,
+                                     char **,
+                                     ssize_t,
+                                     ssize_t,
+                                     const ssize_t *,
+                                     const std::vector<sycl::event> &);
+
+template <typename ProjectorT, typename Ty, typename indT>
+sycl::event take_impl(sycl::queue &q,
+                      std::size_t orthog_nelems,
+                      std::size_t ind_nelems,
+                      int nd,
+                      int ind_nd,
+                      int k,
+                      const ssize_t *orthog_shape_and_strides,
+                      const ssize_t *axes_shape_and_strides,
+                      const ssize_t *ind_shape_and_strides,
+                      const char *src_p,
+                      char *dst_p,
+                      char **ind_p,
+                      ssize_t src_offset,
+                      ssize_t dst_offset,
+                      const ssize_t *ind_offsets,
+                      const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
+
+    sycl::event take_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using OrthogIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        const OrthogIndexerT orthog_indexer{nd, src_offset, dst_offset,
+                                            orthog_shape_and_strides};
+
+        using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset;
+        const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets,
+                                                ind_shape_and_strides};
+
+        using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+        const AxesIndexerT axes_indexer{ind_nd, 0,
+                                        axes_shape_and_strides + (2 * k)};
+
+        using KernelName =
+            take_kernel<ProjectorT, OrthogIndexerT, NthStrideIndexerT,
+                        AxesIndexerT, Ty, indT>;
+
+        const std::size_t gws = orthog_nelems * ind_nelems;
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(gws),
+            TakeFunctor<ProjectorT, OrthogIndexerT, NthStrideIndexerT,
+                        AxesIndexerT, Ty, indT>(
+                src_p, dst_p, ind_p, k, ind_nelems, axes_shape_and_strides,
+                orthog_indexer, indices_indexer, axes_indexer));
+    });
+
+    return take_ev;
+}
+
+template <typename ProjectorT,
+          typename OrthogIndexer,
+          typename IndicesIndexer,
+          typename AxesIndexer,
+          typename T,
+          typename indT>
+class PutFunctor
+{
+private:
+    char *dst_ = nullptr;
+    const char *val_ = nullptr;
+    char **ind_ = nullptr;
+    int k_ = 0;
+    std::size_t ind_nelems_ = 0;
+    const ssize_t *axes_shape_and_strides_ = nullptr;
+    OrthogIndexer orthog_strider;
+    IndicesIndexer ind_strider;
+    AxesIndexer axes_strider;
+
+public:
+    PutFunctor(char *dst_cp,
+               const char *val_cp,
+               char **ind_cp,
+               int k,
+               std::size_t ind_nelems,
+               const ssize_t *axes_shape_and_strides,
+               const OrthogIndexer &orthog_strider_,
+               const IndicesIndexer &ind_strider_,
+               const AxesIndexer &axes_strider_)
+        : dst_(dst_cp), val_(val_cp), ind_(ind_cp), k_(k),
+          ind_nelems_(ind_nelems),
+          axes_shape_and_strides_(axes_shape_and_strides),
+          orthog_strider(orthog_strider_), ind_strider(ind_strider_),
+          axes_strider(axes_strider_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        T *dst = reinterpret_cast<T *>(dst_);
+        const T *val = reinterpret_cast<const T *>(val_);
+
+        ssize_t i_orthog = id / ind_nelems_;
+        ssize_t i_along = id - (i_orthog * ind_nelems_);
+
+        auto orthog_offsets = orthog_strider(i_orthog);
+
+        ssize_t dst_offset = orthog_offsets.get_first_offset();
+        ssize_t val_offset = orthog_offsets.get_second_offset();
+
+        static constexpr ProjectorT proj{};
+        for (int axis_idx = 0; axis_idx < k_; ++axis_idx) {
+            indT *ind_data = reinterpret_cast<indT *>(ind_[axis_idx]);
+
+            ssize_t ind_offset = ind_strider(i_along, axis_idx);
+
+            // proj produces an index in the range of the given axis
+            ssize_t projected_idx =
+                proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]);
+            dst_offset +=
+                projected_idx * axes_shape_and_strides_[k_ + axis_idx];
+        }
+
+        val_offset += axes_strider(i_along);
+
+        dst[dst_offset] = val[val_offset];
+    }
+};
+
+template <typename ProjectorT,
+          typename OrthogIndexer,
+          typename IndicesIndexer,
+          typename AxesIndexer,
+          typename T,
+          typename indT>
+class put_kernel;
+
+typedef sycl::event (*put_fn_ptr_t)(sycl::queue &,
+                                    std::size_t,
+                                    std::size_t,
+                                    int,
+                                    int,
+                                    int,
+                                    const ssize_t *,
+                                    const ssize_t *,
+                                    const ssize_t *,
+                                    char *,
+                                    const char *,
+                                    char **,
+                                    ssize_t,
+                                    ssize_t,
+                                    const ssize_t *,
+                                    const std::vector<sycl::event> &);
+
+template <typename ProjectorT, typename Ty, typename indT>
+sycl::event put_impl(sycl::queue &q,
+                     std::size_t orthog_nelems,
+                     std::size_t ind_nelems,
+                     int nd,
+                     int ind_nd,
+                     int k,
+                     const ssize_t *orthog_shape_and_strides,
+                     const ssize_t *axes_shape_and_strides,
+                     const ssize_t *ind_shape_and_strides,
+                     char *dst_p,
+                     const char *val_p,
+                     char **ind_p,
+                     ssize_t dst_offset,
+                     ssize_t val_offset,
+                     const ssize_t *ind_offsets,
+                     const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
+
+    sycl::event put_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using OrthogIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        const OrthogIndexerT orthog_indexer{nd, dst_offset, val_offset,
+                                            orthog_shape_and_strides};
+
+        using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset;
+        const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets,
+                                                ind_shape_and_strides};
+
+        using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+        const AxesIndexerT axes_indexer{ind_nd, 0,
+                                        axes_shape_and_strides + (2 * k)};
+
+        using KernelName =
+            put_kernel<ProjectorT, OrthogIndexerT, NthStrideIndexerT,
+                       AxesIndexerT, Ty, indT>;
+
+        const std::size_t gws = orthog_nelems * ind_nelems;
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(gws),
+            PutFunctor<ProjectorT, OrthogIndexerT, NthStrideIndexerT,
+                       AxesIndexerT, Ty, indT>(
+                dst_p, val_p, ind_p, k, ind_nelems, axes_shape_and_strides,
+                orthog_indexer, indices_indexer, axes_indexer));
+    });
+
+    return put_ev;
+}
+
+template <typename fnT, typename T, typename indT>
+struct TakeWrapFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_integral<indT>::value &&
+                      !std::is_same<indT, bool>::value) {
+            using dpctl::tensor::indexing_utils::WrapIndex;
+            fnT fn = take_impl<WrapIndex<indT>, T, indT>;
+            return fn;
+        }
+        else {
+            fnT fn = nullptr;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T, typename indT>
+struct TakeClipFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_integral<indT>::value &&
+                      !std::is_same<indT, bool>::value) {
+            using dpctl::tensor::indexing_utils::ClipIndex;
+            fnT fn = take_impl<ClipIndex<indT>, T, indT>;
+            return fn;
+        }
+        else {
+            fnT fn = nullptr;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T, typename indT>
+struct PutWrapFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_integral<indT>::value &&
+                      !std::is_same<indT, bool>::value) {
+            using dpctl::tensor::indexing_utils::WrapIndex;
+            fnT fn = put_impl<WrapIndex<indT>, T, indT>;
+            return fn;
+        }
+        else {
+            fnT fn = nullptr;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T, typename indT>
+struct PutClipFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_integral<indT>::value &&
+                      !std::is_same<indT, bool>::value) {
+            using dpctl::tensor::indexing_utils::ClipIndex;
+            fnT fn = put_impl<ClipIndex<indT>, T, indT>;
+            return fn;
+        }
+        else {
+            fnT fn = nullptr;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::indexing
diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp
new file mode 100644
index 000000000000..aef57836666e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp
@@ -0,0 +1,311 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/complex.h>
+#include <pybind11/pybind11.h>
+
+#include "kernels/constructors.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "full_ctor.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+using dpctl::utils::keep_args_alive;
+
+typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &,
+                                            std::size_t,
+                                            const py::object &,
+                                            char *,
+                                            const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to submit kernel to fill given contiguous memory allocation
+ * with specified value.
+ *
+ * @param exec_q  Sycl queue to which kernel is submitted for execution.
+ * @param nelems  Length of the sequence
+ * @param py_value  Python object representing the value to fill the array with.
+ * Must be convertible to `dstTy`.
+ * @param dst_p  Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename dstTy>
+sycl::event full_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const py::object &py_value,
+                             char *dst_p,
+                             const std::vector<sycl::event> &depends)
+{
+    dstTy fill_v = py::cast<dstTy>(py_value);
+
+    sycl::event fill_ev;
+
+    if constexpr (sizeof(dstTy) == sizeof(char)) {
+        const auto memset_val = sycl::bit_cast<unsigned char>(fill_v);
+        fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
+                       nelems * sizeof(dstTy));
+        });
+    }
+    else {
+        bool is_zero = false;
+        if constexpr (sizeof(dstTy) == 1) {
+            is_zero = (std::uint8_t{0} == sycl::bit_cast<std::uint8_t>(fill_v));
+        }
+        else if constexpr (sizeof(dstTy) == 2) {
+            is_zero =
+                (std::uint16_t{0} == sycl::bit_cast<std::uint16_t>(fill_v));
+        }
+        else if constexpr (sizeof(dstTy) == 4) {
+            is_zero =
+                (std::uint32_t{0} == sycl::bit_cast<std::uint32_t>(fill_v));
+        }
+        else if constexpr (sizeof(dstTy) == 8) {
+            is_zero =
+                (std::uint64_t{0} == sycl::bit_cast<std::uint64_t>(fill_v));
+        }
+        else if constexpr (sizeof(dstTy) == 16) {
+            struct UInt128
+            {
+
+                constexpr UInt128() : v1{}, v2{} {}
+                UInt128(const UInt128 &) = default;
+
+                operator bool() const
+                {
+                    return bool(!v1) && bool(!v2);
+                }
+
+                std::uint64_t v1;
+                std::uint64_t v2;
+            };
+            is_zero = static_cast<bool>(sycl::bit_cast<UInt128>(fill_v));
+        }
+
+        if (is_zero) {
+            static constexpr int memset_val = 0;
+            fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(depends);
+
+                cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
+                           nelems * sizeof(dstTy));
+            });
+        }
+        else {
+            using dpctl::tensor::kernels::constructors::full_contig_impl;
+
+            fill_ev =
+                full_contig_impl<dstTy>(exec_q, nelems, fill_v, dst_p, depends);
+        }
+    }
+
+    return fill_ev;
+}
+
+template <typename fnT, typename Ty>
+struct FullContigFactory
+{
+    fnT get()
+    {
+        fnT f = full_contig_impl<Ty>;
+        return f;
+    }
+};
+
+typedef sycl::event (*full_strided_fn_ptr_t)(sycl::queue &,
+                                             int,
+                                             std::size_t,
+                                             py::ssize_t *,
+                                             const py::object &,
+                                             char *,
+                                             const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to submit kernel to fill given strided memory allocation
+ * with specified value.
+ *
+ * @param exec_q  Sycl queue to which kernel is submitted for execution.
+ * @param nd  Array dimensionality
+ * @param nelems  Length of the sequence
+ * @param shape_strides  Kernel accessible USM pointer to packed shape and
+ * strides of array.
+ * @param py_value  Python object representing the value to fill the array with.
+ * Must be convertible to `dstTy`.
+ * @param dst_p  Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename dstTy>
+sycl::event full_strided_impl(sycl::queue &exec_q,
+                              int nd,
+                              std::size_t nelems,
+                              py::ssize_t *shape_strides,
+                              const py::object &py_value,
+                              char *dst_p,
+                              const std::vector<sycl::event> &depends)
+{
+    dstTy fill_v = py::cast<dstTy>(py_value);
+
+    using dpctl::tensor::kernels::constructors::full_strided_impl;
+    sycl::event fill_ev = full_strided_impl<dstTy>(
+        exec_q, nd, nelems, shape_strides, fill_v, dst_p, depends);
+
+    return fill_ev;
+}
+
+template <typename fnT, typename Ty>
+struct FullStridedFactory
+{
+    fnT get()
+    {
+        fnT f = full_strided_impl<Ty>;
+        return f;
+    }
+};
+
+static full_contig_fn_ptr_t full_contig_dispatch_vector[td_ns::num_types];
+static full_strided_fn_ptr_t full_strided_dispatch_vector[td_ns::num_types];
+
+std::pair<sycl::event, sycl::event>
+    usm_ndarray_full(const py::object &py_value,
+                     const dpctl::tensor::usm_ndarray &dst,
+                     sycl::queue &exec_q,
+                     const std::vector<sycl::event> &depends)
+{
+    // py_value should be coercible into data type of dst
+
+    py::ssize_t dst_nelems = dst.get_size();
+
+    if (dst_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with the allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int dst_typenum = dst.get_typenum();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    char *dst_data = dst.get_data();
+
+    if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) {
+        auto fn = full_contig_dispatch_vector[dst_typeid];
+
+        sycl::event full_contig_event =
+            fn(exec_q, static_cast<std::size_t>(dst_nelems), py_value, dst_data,
+               depends);
+
+        return std::make_pair(
+            keep_args_alive(exec_q, {dst}, {full_contig_event}),
+            full_contig_event);
+    }
+    else {
+        int nd = dst.get_ndim();
+        auto const &dst_shape = dst.get_shape_vector();
+        auto const &dst_strides = dst.get_strides_vector();
+
+        auto fn = full_strided_dispatch_vector[dst_typeid];
+
+        std::vector<sycl::event> host_task_events;
+        host_task_events.reserve(2);
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, dst_shape, dst_strides);
+        auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+        const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+        py::ssize_t *shape_strides = shape_strides_owner.get();
+
+        const sycl::event &full_strided_ev =
+            fn(exec_q, nd, dst_nelems, shape_strides, py_value, dst_data,
+               {copy_shape_ev});
+
+        // free shape_strides
+        const auto &temporaries_cleanup_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {full_strided_ev}, shape_strides_owner);
+        host_task_events.push_back(temporaries_cleanup_ev);
+
+        return std::make_pair(keep_args_alive(exec_q, {dst}, host_task_events),
+                              full_strided_ev);
+    }
+}
+
+void init_full_ctor_dispatch_vectors(void)
+{
+    using namespace td_ns;
+
+    DispatchVectorBuilder<full_contig_fn_ptr_t, FullContigFactory, num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(full_contig_dispatch_vector);
+
+    DispatchVectorBuilder<full_strided_fn_ptr_t, FullStridedFactory, num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(full_strided_dispatch_vector);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp
new file mode 100644
index 000000000000..18c15de87a40
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp
@@ -0,0 +1,57 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    usm_ndarray_full(const py::object &py_value,
+                     const dpctl::tensor::usm_ndarray &dst,
+                     sycl::queue &exec_q,
+                     const std::vector<sycl::event> &depends = {});
+
+extern void init_full_ctor_dispatch_vectors(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp
new file mode 100644
index 000000000000..925cc2e895ed
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp
@@ -0,0 +1,817 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines implementation functions of dpctl.tensor.take and
+/// dpctl.tensor.put
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/complex.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/integer_advanced_indexing.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "integer_advanced_indexing.hpp"
+
+#define INDEXING_MODES 2
+#define WRAP_MODE      0
+#define CLIP_MODE      1
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::indexing::put_fn_ptr_t;
+using dpctl::tensor::kernels::indexing::take_fn_ptr_t;
+
+static take_fn_ptr_t take_dispatch_table[INDEXING_MODES][td_ns::num_types]
+                                        [td_ns::num_types];
+
+static put_fn_ptr_t put_dispatch_table[INDEXING_MODES][td_ns::num_types]
+                                      [td_ns::num_types];
+
+namespace py = pybind11;
+
+using dpctl::utils::keep_args_alive;
+
+std::vector<sycl::event>
+    _populate_kernel_params(sycl::queue &exec_q,
+                            std::vector<sycl::event> &host_task_events,
+                            char **device_ind_ptrs,
+                            py::ssize_t *device_ind_sh_st,
+                            py::ssize_t *device_ind_offsets,
+                            py::ssize_t *device_orthog_sh_st,
+                            py::ssize_t *device_along_sh_st,
+                            const py::ssize_t *inp_shape,
+                            const py::ssize_t *arr_shape,
+                            std::vector<py::ssize_t> &inp_strides,
+                            std::vector<py::ssize_t> &arr_strides,
+                            std::vector<py::ssize_t> &ind_sh_sts,
+                            std::vector<char *> &ind_ptrs,
+                            std::vector<py::ssize_t> &ind_offsets,
+                            int axis_start,
+                            int k,
+                            int ind_nd,
+                            int inp_nd,
+                            int orthog_sh_elems,
+                            int ind_sh_elems)
+{
+
+    using usm_host_allocator_T =
+        dpctl::tensor::alloc_utils::usm_host_allocator<char *>;
+    using ptrT = std::vector<char *, usm_host_allocator_T>;
+
+    usm_host_allocator_T ptr_allocator(exec_q);
+    std::shared_ptr<ptrT> host_ind_ptrs_shp =
+        std::make_shared<ptrT>(k, ptr_allocator);
+
+    using usm_host_allocatorT =
+        dpctl::tensor::alloc_utils::usm_host_allocator<py::ssize_t>;
+    using shT = std::vector<py::ssize_t, usm_host_allocatorT>;
+
+    usm_host_allocatorT sz_allocator(exec_q);
+    std::shared_ptr<shT> host_ind_sh_st_shp =
+        std::make_shared<shT>(ind_sh_elems * (k + 1), sz_allocator);
+
+    std::shared_ptr<shT> host_ind_offsets_shp =
+        std::make_shared<shT>(k, sz_allocator);
+
+    std::shared_ptr<shT> host_orthog_sh_st_shp =
+        std::make_shared<shT>(3 * orthog_sh_elems, sz_allocator);
+
+    std::shared_ptr<shT> host_along_sh_st_shp =
+        std::make_shared<shT>(2 * (k + ind_sh_elems), sz_allocator);
+
+    std::copy(ind_sh_sts.begin(), ind_sh_sts.end(),
+              host_ind_sh_st_shp->begin());
+    std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin());
+    std::copy(ind_offsets.begin(), ind_offsets.end(),
+              host_ind_offsets_shp->begin());
+
+    const sycl::event &device_ind_ptrs_copy_ev = exec_q.copy<char *>(
+        host_ind_ptrs_shp->data(), device_ind_ptrs, host_ind_ptrs_shp->size());
+
+    const sycl::event &device_ind_sh_st_copy_ev =
+        exec_q.copy<py::ssize_t>(host_ind_sh_st_shp->data(), device_ind_sh_st,
+                                 host_ind_sh_st_shp->size());
+
+    const sycl::event &device_ind_offsets_copy_ev = exec_q.copy<py::ssize_t>(
+        host_ind_offsets_shp->data(), device_ind_offsets,
+        host_ind_offsets_shp->size());
+
+    int orthog_nd = inp_nd - k;
+
+    if (orthog_nd > 0) {
+        if (axis_start > 0) {
+            std::copy(inp_shape, inp_shape + axis_start,
+                      host_orthog_sh_st_shp->begin());
+            std::copy(inp_strides.begin(), inp_strides.begin() + axis_start,
+                      host_orthog_sh_st_shp->begin() + orthog_sh_elems);
+            std::copy(arr_strides.begin(), arr_strides.begin() + axis_start,
+                      host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems);
+        }
+        if (inp_nd > (axis_start + k)) {
+            std::copy(inp_shape + axis_start + k, inp_shape + inp_nd,
+                      host_orthog_sh_st_shp->begin() + axis_start);
+            std::copy(inp_strides.begin() + axis_start + k, inp_strides.end(),
+                      host_orthog_sh_st_shp->begin() + orthog_sh_elems +
+                          axis_start);
+
+            std::copy(arr_strides.begin() + axis_start + ind_nd,
+                      arr_strides.end(),
+                      host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems +
+                          axis_start);
+        }
+    }
+
+    if (inp_nd > 0) {
+        std::copy(inp_shape + axis_start, inp_shape + axis_start + k,
+                  host_along_sh_st_shp->begin());
+
+        std::copy(inp_strides.begin() + axis_start,
+                  inp_strides.begin() + axis_start + k,
+                  host_along_sh_st_shp->begin() + k);
+    }
+
+    if (ind_nd > 0) {
+        std::copy(arr_shape + axis_start, arr_shape + axis_start + ind_nd,
+                  host_along_sh_st_shp->begin() + 2 * k);
+        std::copy(arr_strides.begin() + axis_start,
+                  arr_strides.begin() + axis_start + ind_nd,
+                  host_along_sh_st_shp->begin() + 2 * k + ind_nd);
+    }
+
+    const sycl::event &device_orthog_sh_st_copy_ev = exec_q.copy<py::ssize_t>(
+        host_orthog_sh_st_shp->data(), device_orthog_sh_st,
+        host_orthog_sh_st_shp->size());
+
+    const sycl::event &device_along_sh_st_copy_ev = exec_q.copy<py::ssize_t>(
+        host_along_sh_st_shp->data(), device_along_sh_st,
+        host_along_sh_st_shp->size());
+
+    const sycl::event &shared_ptr_cleanup_ev =
+        exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on({device_along_sh_st_copy_ev,
+                            device_orthog_sh_st_copy_ev,
+                            device_ind_offsets_copy_ev,
+                            device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev});
+            cgh.host_task(
+                [host_ind_offsets_shp = std::move(host_ind_offsets_shp),
+                 host_ind_sh_st_shp = std::move(host_ind_sh_st_shp),
+                 host_ind_ptrs_shp = std::move(host_ind_ptrs_shp),
+                 host_orthog_sh_st_shp = std::move(host_orthog_sh_st_shp),
+                 host_along_sh_st_shp = std::move(host_along_sh_st_shp)] {});
+        });
+    host_task_events.push_back(shared_ptr_cleanup_ev);
+
+    std::vector<sycl::event> sh_st_pack_deps{
+        device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev,
+        device_ind_offsets_copy_ev, device_orthog_sh_st_copy_ev,
+        device_along_sh_st_copy_ev};
+    return sh_st_pack_deps;
+}
+
+/* Utility to parse python object py_ind into vector of `usm_ndarray`s */
+std::vector<dpctl::tensor::usm_ndarray> parse_py_ind(const sycl::queue &q,
+                                                     const py::object &py_ind)
+{
+    std::size_t ind_count = py::len(py_ind);
+    std::vector<dpctl::tensor::usm_ndarray> res;
+    res.reserve(ind_count);
+
+    bool nd_is_known = false;
+    int nd = -1;
+    for (std::size_t i = 0; i < ind_count; ++i) {
+        py::object el_i = py_ind[py::cast(i)];
+        dpctl::tensor::usm_ndarray arr_i =
+            py::cast<dpctl::tensor::usm_ndarray>(el_i);
+        if (!dpctl::utils::queues_are_compatible(q, {arr_i})) {
+            throw py::value_error("Index allocation queue is not compatible "
+                                  "with execution queue");
+        }
+        if (nd_is_known) {
+            if (nd != arr_i.get_ndim()) {
+                throw py::value_error(
+                    "Indices must have the same number of dimensions.");
+            }
+        }
+        else {
+            nd_is_known = true;
+            nd = arr_i.get_ndim();
+        }
+        res.push_back(arr_i);
+    }
+
+    return res;
+}
+
+std::pair<sycl::event, sycl::event>
+    usm_ndarray_take(const dpctl::tensor::usm_ndarray &src,
+                     const py::object &py_ind,
+                     const dpctl::tensor::usm_ndarray &dst,
+                     int axis_start,
+                     std::uint8_t mode,
+                     sycl::queue &exec_q,
+                     const std::vector<sycl::event> &depends)
+{
+    std::vector<dpctl::tensor::usm_ndarray> ind = parse_py_ind(exec_q, py_ind);
+
+    int k = ind.size();
+
+    if (k == 0) {
+        throw py::value_error("List of indices is empty.");
+    }
+
+    if (axis_start < 0) {
+        throw py::value_error("Axis cannot be negative.");
+    }
+
+    if (mode != 0 && mode != 1) {
+        throw py::value_error("Mode must be 0 or 1.");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    const dpctl::tensor::usm_ndarray ind_rep = ind[0];
+
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    int ind_nd = ind_rep.get_ndim();
+
+    auto sh_elems = std::max<int>(src_nd, 1);
+
+    if (axis_start + k > sh_elems) {
+        throw py::value_error("Axes are out of range for array of dimension " +
+                              std::to_string(src_nd));
+    }
+    if (src_nd == 0) {
+        if (dst_nd != ind_nd) {
+            throw py::value_error(
+                "Destination is not of appropriate dimension for take kernel.");
+        }
+    }
+    else {
+        if (dst_nd != (src_nd - k + ind_nd)) {
+            throw py::value_error(
+                "Destination is not of appropriate dimension for take kernel.");
+        }
+    }
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool orthog_shapes_equal(true);
+    std::size_t orthog_nelems(1);
+    for (int i = 0; i < (src_nd - k); ++i) {
+        auto idx1 = (i < axis_start) ? i : i + k;
+        auto idx2 = (i < axis_start) ? i : i + ind_nd;
+
+        orthog_nelems *= static_cast<std::size_t>(src_shape[idx1]);
+        orthog_shapes_equal =
+            orthog_shapes_equal && (src_shape[idx1] == dst_shape[idx2]);
+    }
+
+    if (!orthog_shapes_equal) {
+        throw py::value_error(
+            "Axes of basic indices are not of matching shapes.");
+    }
+
+    if (orthog_nelems == 0) {
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Array memory overlap.");
+    }
+
+    py::ssize_t src_offset = py::ssize_t(0);
+    py::ssize_t dst_offset = py::ssize_t(0);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_type_id != dst_type_id) {
+        throw py::type_error("Array data types are not the same.");
+    }
+
+    const py::ssize_t *ind_shape = ind_rep.get_shape_raw();
+
+    int ind_typenum = ind_rep.get_typenum();
+    int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum);
+
+    std::size_t ind_nelems(1);
+    for (int i = 0; i < ind_nd; ++i) {
+        ind_nelems *= static_cast<std::size_t>(ind_shape[i]);
+
+        if (!(ind_shape[i] == dst_shape[axis_start + i])) {
+            throw py::value_error(
+                "Indices shape does not match shape of axis in destination.");
+        }
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, orthog_nelems * ind_nelems);
+
+    int ind_sh_elems = std::max<int>(ind_nd, 1);
+
+    std::vector<char *> ind_ptrs;
+    ind_ptrs.reserve(k);
+
+    std::vector<py::ssize_t> ind_offsets;
+    ind_offsets.reserve(k);
+
+    std::vector<py::ssize_t> ind_sh_sts((k + 1) * ind_sh_elems, 0);
+    if (ind_nd > 0) {
+        std::copy(ind_shape, ind_shape + ind_nd, ind_sh_sts.begin());
+    }
+    for (int i = 0; i < k; ++i) {
+        dpctl::tensor::usm_ndarray ind_ = ind[i];
+
+        if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) {
+            throw py::value_error(
+                "Execution queue is not compatible with allocation queues");
+        }
+
+        // ndim, type, and shape are checked against the first array
+        if (i > 0) {
+            if (!(ind_.get_ndim() == ind_nd)) {
+                throw py::value_error("Index dimensions are not the same");
+            }
+
+            if (!(ind_type_id ==
+                  array_types.typenum_to_lookup_id(ind_.get_typenum()))) {
+                throw py::type_error(
+                    "Indices array data types are not all the same.");
+            }
+
+            const py::ssize_t *ind_shape_ = ind_.get_shape_raw();
+            for (int dim = 0; dim < ind_nd; ++dim) {
+                if (!(ind_shape[dim] == ind_shape_[dim])) {
+                    throw py::value_error("Indices shapes are not all equal.");
+                }
+            }
+        }
+
+        // check for overlap with destination
+        if (overlap(dst, ind_)) {
+            throw py::value_error(
+                "Arrays index overlapping segments of memory");
+        }
+
+        char *ind_data = ind_.get_data();
+
+        // strides are initialized to 0 for 0D indices, so skip here
+        if (ind_nd > 0) {
+            auto ind_strides = ind_.get_strides_vector();
+            std::copy(ind_strides.begin(), ind_strides.end(),
+                      ind_sh_sts.begin() + (i + 1) * ind_nd);
+        }
+
+        ind_ptrs.push_back(ind_data);
+        ind_offsets.push_back(py::ssize_t(0));
+    }
+
+    if (ind_nelems == 0) {
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    auto packed_ind_ptrs_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<char *>(k, exec_q);
+    char **packed_ind_ptrs = packed_ind_ptrs_owner.get();
+
+    // rearrange to past where indices shapes are checked
+    // packed_ind_shapes_strides = [ind_shape,
+    //                              ind[0] strides,
+    //                              ...,
+    //                              ind[k] strides]
+    auto packed_ind_shapes_strides_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(
+            (k + 1) * ind_sh_elems, exec_q);
+    py::ssize_t *packed_ind_shapes_strides =
+        packed_ind_shapes_strides_owner.get();
+
+    auto packed_ind_offsets_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(k, exec_q);
+    py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get();
+
+    int orthog_sh_elems = std::max<int>(src_nd - k, 1);
+
+    // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:],
+    //                          src_strides[:axis] + src_strides[axis+k:],
+    //                          dst_strides[:axis] +
+    //                          dst_strides[axis+ind.ndim:]]
+    auto packed_shapes_strides_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(
+            3 * orthog_sh_elems, exec_q);
+    py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get();
+
+    // packed_axes_shapes_strides = [src_shape[axis:axis+k],
+    //                               src_strides[axis:axis+k],
+    //                               dst_shape[axis:axis+ind.ndim],
+    //                               dst_strides[axis:axis+ind.ndim]]
+    auto packed_axes_shapes_strides_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(
+            2 * (k + ind_sh_elems), exec_q);
+    py::ssize_t *packed_axes_shapes_strides =
+        packed_axes_shapes_strides_owner.get();
+
+    auto src_strides = src.get_strides_vector();
+    auto dst_strides = dst.get_strides_vector();
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    std::vector<sycl::event> pack_deps = _populate_kernel_params(
+        exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides,
+        packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides,
+        src_shape, dst_shape, src_strides, dst_strides, ind_sh_sts, ind_ptrs,
+        ind_offsets, axis_start, k, ind_nd, src_nd, orthog_sh_elems,
+        ind_sh_elems);
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + pack_deps.size());
+    all_deps.insert(std::end(all_deps), std::begin(pack_deps),
+                    std::end(pack_deps));
+    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
+
+    auto fn = take_dispatch_table[mode][src_type_id][ind_type_id];
+
+    if (fn == nullptr) {
+        sycl::event::wait(host_task_events);
+        throw std::runtime_error("Indices must be integer type, got " +
+                                 std::to_string(ind_type_id));
+    }
+
+    sycl::event take_generic_ev =
+        fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k,
+           packed_shapes_strides, packed_axes_shapes_strides,
+           packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs,
+           src_offset, dst_offset, packed_ind_offsets, all_deps);
+
+    // free packed temporaries
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {take_generic_ev}, packed_shapes_strides_owner,
+            packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner,
+            packed_ind_ptrs_owner, packed_ind_offsets_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    sycl::event arg_cleanup_ev =
+        keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events);
+
+    return std::make_pair(arg_cleanup_ev, take_generic_ev);
+}
+
+std::pair<sycl::event, sycl::event>
+    usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst,
+                    const py::object &py_ind,
+                    const dpctl::tensor::usm_ndarray &val,
+                    int axis_start,
+                    std::uint8_t mode,
+                    sycl::queue &exec_q,
+                    const std::vector<sycl::event> &depends)
+{
+    std::vector<dpctl::tensor::usm_ndarray> ind = parse_py_ind(exec_q, py_ind);
+    int k = ind.size();
+
+    if (k == 0) {
+        // no indices to write to
+        throw py::value_error("List of indices is empty.");
+    }
+
+    if (axis_start < 0) {
+        throw py::value_error("Axis cannot be negative.");
+    }
+
+    if (mode != 0 && mode != 1) {
+        throw py::value_error("Mode must be 0 or 1.");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    const dpctl::tensor::usm_ndarray ind_rep = ind[0];
+
+    int dst_nd = dst.get_ndim();
+    int val_nd = val.get_ndim();
+    int ind_nd = ind_rep.get_ndim();
+
+    auto sh_elems = std::max<int>(dst_nd, 1);
+
+    if (axis_start + k > sh_elems) {
+        throw py::value_error("Axes are out of range for array of dimension " +
+                              std::to_string(dst_nd));
+    }
+    if (dst_nd == 0) {
+        if (val_nd != ind_nd) {
+            throw py::value_error("Destination is not of appropriate dimension "
+                                  "for put function.");
+        }
+    }
+    else {
+        if (val_nd != (dst_nd - k + ind_nd)) {
+            throw py::value_error("Destination is not of appropriate dimension "
+                                  "for put function.");
+        }
+    }
+
+    std::size_t dst_nelems = dst.get_size();
+
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    const py::ssize_t *val_shape = val.get_shape_raw();
+
+    bool orthog_shapes_equal(true);
+    std::size_t orthog_nelems(1);
+    for (int i = 0; i < (dst_nd - k); ++i) {
+        auto idx1 = (i < axis_start) ? i : i + k;
+        auto idx2 = (i < axis_start) ? i : i + ind_nd;
+
+        orthog_nelems *= static_cast<std::size_t>(dst_shape[idx1]);
+        orthog_shapes_equal =
+            orthog_shapes_equal && (dst_shape[idx1] == val_shape[idx2]);
+    }
+
+    if (!orthog_shapes_equal) {
+        throw py::value_error(
+            "Axes of basic indices are not of matching shapes.");
+    }
+
+    if (orthog_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    char *dst_data = dst.get_data();
+    char *val_data = val.get_data();
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst, val})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(val, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    py::ssize_t dst_offset = py::ssize_t(0);
+    py::ssize_t val_offset = py::ssize_t(0);
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
+
+    int dst_typenum = dst.get_typenum();
+    int val_typenum = val.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+    int val_type_id = array_types.typenum_to_lookup_id(val_typenum);
+
+    if (dst_type_id != val_type_id) {
+        throw py::type_error("Array data types are not the same.");
+    }
+
+    const py::ssize_t *ind_shape = ind_rep.get_shape_raw();
+
+    int ind_typenum = ind_rep.get_typenum();
+    int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum);
+
+    std::size_t ind_nelems(1);
+    for (int i = 0; i < ind_nd; ++i) {
+        ind_nelems *= static_cast<std::size_t>(ind_shape[i]);
+
+        if (!(ind_shape[i] == val_shape[axis_start + i])) {
+            throw py::value_error(
+                "Indices shapes does not match shape of axis in vals.");
+        }
+    }
+
+    auto ind_sh_elems = std::max<int>(ind_nd, 1);
+
+    std::vector<char *> ind_ptrs;
+    ind_ptrs.reserve(k);
+    std::vector<py::ssize_t> ind_offsets;
+    ind_offsets.reserve(k);
+    std::vector<py::ssize_t> ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0));
+    if (ind_nd > 0) {
+        std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin());
+    }
+    for (int i = 0; i < k; ++i) {
+        dpctl::tensor::usm_ndarray ind_ = ind[i];
+
+        if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) {
+            throw py::value_error(
+                "Execution queue is not compatible with allocation queues");
+        }
+
+        // ndim, type, and shape are checked against the first array
+        if (i > 0) {
+            if (!(ind_.get_ndim() == ind_nd)) {
+                throw py::value_error("Index dimensions are not the same");
+            }
+
+            if (!(ind_type_id ==
+                  array_types.typenum_to_lookup_id(ind_.get_typenum()))) {
+                throw py::type_error(
+                    "Indices array data types are not all the same.");
+            }
+
+            const py::ssize_t *ind_shape_ = ind_.get_shape_raw();
+            for (int dim = 0; dim < ind_nd; ++dim) {
+                if (!(ind_shape[dim] == ind_shape_[dim])) {
+                    throw py::value_error("Indices shapes are not all equal.");
+                }
+            }
+        }
+
+        // check for overlap with destination
+        if (overlap(ind_, dst)) {
+            throw py::value_error(
+                "Arrays index overlapping segments of memory");
+        }
+
+        char *ind_data = ind_.get_data();
+
+        // strides are initialized to 0 for 0D indices, so skip here
+        if (ind_nd > 0) {
+            auto ind_strides = ind_.get_strides_vector();
+            std::copy(ind_strides.begin(), ind_strides.end(),
+                      ind_sh_sts.begin() + (i + 1) * ind_nd);
+        }
+
+        ind_ptrs.push_back(ind_data);
+        ind_offsets.push_back(py::ssize_t(0));
+    }
+
+    if (ind_nelems == 0) {
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    auto packed_ind_ptrs_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<char *>(k, exec_q);
+    char **packed_ind_ptrs = packed_ind_ptrs_owner.get();
+
+    // packed_ind_shapes_strides = [ind_shape,
+    //                              ind[0] strides,
+    //                              ...,
+    //                              ind[k] strides]
+    auto packed_ind_shapes_strides_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(
+            (k + 1) * ind_sh_elems, exec_q);
+    py::ssize_t *packed_ind_shapes_strides =
+        packed_ind_shapes_strides_owner.get();
+
+    auto packed_ind_offsets_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(k, exec_q);
+    py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get();
+
+    int orthog_sh_elems = std::max<int>(dst_nd - k, 1);
+
+    // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:],
+    //                          dst_strides[:axis] + dst_strides[axis+k:],
+    //                          val_strides[:axis] +
+    //                          val_strides[axis+ind.ndim:]]
+    auto packed_shapes_strides_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(
+            3 * orthog_sh_elems, exec_q);
+    py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get();
+
+    // packed_axes_shapes_strides = [dst_shape[axis:axis+k],
+    //                               dst_strides[axis:axis+k],
+    //                               val_shape[axis:axis+ind.ndim],
+    //                               val_strides[axis:axis+ind.ndim]]
+    auto packed_axes_shapes_strides_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(
+            2 * (k + ind_sh_elems), exec_q);
+    py::ssize_t *packed_axes_shapes_strides =
+        packed_axes_shapes_strides_owner.get();
+
+    auto dst_strides = dst.get_strides_vector();
+    auto val_strides = val.get_strides_vector();
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    std::vector<sycl::event> pack_deps = _populate_kernel_params(
+        exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides,
+        packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides,
+        dst_shape, val_shape, dst_strides, val_strides, ind_sh_sts, ind_ptrs,
+        ind_offsets, axis_start, k, ind_nd, dst_nd, orthog_sh_elems,
+        ind_sh_elems);
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + pack_deps.size());
+    all_deps.insert(std::end(all_deps), std::begin(pack_deps),
+                    std::end(pack_deps));
+    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
+
+    auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id];
+
+    if (fn == nullptr) {
+        sycl::event::wait(host_task_events);
+        throw std::runtime_error("Indices must be integer type, got " +
+                                 std::to_string(ind_type_id));
+    }
+
+    sycl::event put_generic_ev =
+        fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k,
+           packed_shapes_strides, packed_axes_shapes_strides,
+           packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs,
+           dst_offset, val_offset, packed_ind_offsets, all_deps);
+
+    // free packed temporaries
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {put_generic_ev}, packed_shapes_strides_owner,
+            packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner,
+            packed_ind_ptrs_owner, packed_ind_offsets_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    sycl::event arg_cleanup_ev =
+        keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events);
+
+    return std::make_pair(arg_cleanup_ev, put_generic_ev);
+}
+
+void init_advanced_indexing_dispatch_tables(void)
+{
+    using namespace td_ns;
+
+    using dpctl::tensor::kernels::indexing::TakeClipFactory;
+    DispatchTableBuilder<take_fn_ptr_t, TakeClipFactory, num_types>
+        dtb_takeclip;
+    dtb_takeclip.populate_dispatch_table(take_dispatch_table[CLIP_MODE]);
+
+    using dpctl::tensor::kernels::indexing::TakeWrapFactory;
+    DispatchTableBuilder<take_fn_ptr_t, TakeWrapFactory, num_types>
+        dtb_takewrap;
+    dtb_takewrap.populate_dispatch_table(take_dispatch_table[WRAP_MODE]);
+
+    using dpctl::tensor::kernels::indexing::PutClipFactory;
+    DispatchTableBuilder<put_fn_ptr_t, PutClipFactory, num_types> dtb_putclip;
+    dtb_putclip.populate_dispatch_table(put_dispatch_table[CLIP_MODE]);
+
+    using dpctl::tensor::kernels::indexing::PutWrapFactory;
+    DispatchTableBuilder<put_fn_ptr_t, PutWrapFactory, num_types> dtb_putwrap;
+    dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp
new file mode 100644
index 000000000000..bc0136288e1c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp
@@ -0,0 +1,71 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares Python API for implementation functions of
+/// dpctl.tensor.take and dpctl.tensor.put
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    usm_ndarray_take(const dpctl::tensor::usm_ndarray &,
+                     const py::object &,
+                     const dpctl::tensor::usm_ndarray &,
+                     int,
+                     std::uint8_t,
+                     sycl::queue &,
+                     const std::vector<sycl::event> & = {});
+
+extern std::pair<sycl::event, sycl::event>
+    usm_ndarray_put(const dpctl::tensor::usm_ndarray &,
+                    const py::object &,
+                    const dpctl::tensor::usm_ndarray &,
+                    int,
+                    std::uint8_t,
+                    sycl::queue &,
+                    const std::vector<sycl::event> & = {});
+
+extern void init_advanced_indexing_dispatch_tables(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
index 54d6adbc8f6e..0478fb19678c 100644
--- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
+++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
@@ -53,17 +53,17 @@
 // #include "copy_numpy_ndarray_into_usm_ndarray.hpp"
 #include "device_support_queries.hpp"
 // #include "eye_ctor.hpp"
-// #include "full_ctor.hpp"
-// #include "integer_advanced_indexing.hpp"
+#include "full_ctor.hpp"
+#include "integer_advanced_indexing.hpp"
 #include "kernels/dpctl_tensor_types.hpp"
 // #include "linear_sequences.hpp"
 // #include "repeat.hpp"
 #include "simplify_iteration_space.hpp"
-// #include "triul_ctor.hpp"
+#include "triul_ctor.hpp"
 #include "utils/memory_overlap.hpp"
 #include "utils/strided_iters.hpp"
 // #include "where.hpp"
-// #include "zeros_ctor.hpp"
+#include "zeros_ctor.hpp"
 
 namespace py = pybind11;
 
@@ -102,15 +102,15 @@ using dpctl::tensor::py_internal::py_as_f_contig;
 
 /* ================ Full ================== */
 
-// using dpctl::tensor::py_internal::usm_ndarray_full;
+using dpctl::tensor::py_internal::usm_ndarray_full;
 
 /* ================ Zeros ================== */
 
-// using dpctl::tensor::py_internal::usm_ndarray_zeros;
+using dpctl::tensor::py_internal::usm_ndarray_zeros;
 
 /* ============== Advanced Indexing ============= */
-// using dpctl::tensor::py_internal::usm_ndarray_put;
-// using dpctl::tensor::py_internal::usm_ndarray_take;
+using dpctl::tensor::py_internal::usm_ndarray_put;
+using dpctl::tensor::py_internal::usm_ndarray_take;
 
 // using dpctl::tensor::py_internal::py_extract;
 // using dpctl::tensor::py_internal::py_mask_positions;
@@ -128,7 +128,7 @@ using dpctl::tensor::py_internal::py_as_f_contig;
 
 /* =========================== Tril and triu ============================== */
 
-// using dpctl::tensor::py_internal::usm_ndarray_triul;
+using dpctl::tensor::py_internal::usm_ndarray_triul;
 
 /* =========================== Where ============================== */
 
@@ -144,7 +144,7 @@ void init_dispatch_tables(void)
 
     init_copy_and_cast_usm_to_usm_dispatch_tables();
     // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables();
-    // init_advanced_indexing_dispatch_tables();
+    init_advanced_indexing_dispatch_tables();
     // init_where_dispatch_tables();
     return;
 }
@@ -158,10 +158,10 @@ void init_dispatch_vectors(void)
     // init_copy_for_reshape_dispatch_vectors();
     // init_copy_for_roll_dispatch_vectors();
     // init_linear_sequences_dispatch_vectors();
-    // init_full_ctor_dispatch_vectors();
-    // init_zeros_ctor_dispatch_vectors();
+    init_full_ctor_dispatch_vectors();
+    init_zeros_ctor_dispatch_vectors();
     // init_eye_ctor_dispatch_vectors();
-    // init_triul_ctor_dispatch_vectors();
+    init_triul_ctor_dispatch_vectors();
 
     // populate_masked_extract_dispatch_vectors();
     // populate_masked_place_dispatch_vectors();
@@ -299,22 +299,20 @@ PYBIND11_MODULE(_tensor_impl, m)
     //       py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") =
     //       py::list());
 
-    // m.def("_linspace_step", &usm_ndarray_linear_sequence_step,
-    //       "Fills input 1D contiguous usm_ndarray `dst` with linear sequence "
-    //       "specified by "
-    //       "starting point `start` and step `dt`. "
-    //       "Returns a tuple of events: (ht_event, comp_event)",
-    //       py::arg("start"), py::arg("dt"), py::arg("dst"),
-    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine,
-    //       "Fills input 1D contiguous usm_ndarray `dst` with linear sequence "
-    //       "specified by "
-    //       "starting point `start` and end point `end`. "
-    //       "Returns a tuple of events: (ht_event, comp_event)",
-    //       py::arg("start"), py::arg("end"), py::arg("dst"),
-    //       py::arg("include_endpoint"), py::arg("sycl_queue"),
-    //       py::arg("depends") = py::list());
+    //     m.def("_linspace_step", &usm_ndarray_linear_sequence_step,
+    //           "Fills input 1D contiguous usm_ndarray `dst` with linear
+    //           sequence " "specified by " "starting point `start` and step
+    //           `dt`. " "Returns a tuple of events: (ht_event, comp_event)",
+    //           py::arg("start"), py::arg("dt"), py::arg("dst"),
+    //           py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    //     m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine,
+    //           "Fills input 1D contiguous usm_ndarray `dst` with linear
+    //           sequence " "specified by " "starting point `start` and end
+    //           point `end`. " "Returns a tuple of events: (ht_event,
+    //           comp_event)", py::arg("start"), py::arg("end"), py::arg("dst"),
+    //           py::arg("include_endpoint"), py::arg("sycl_queue"),
+    //           py::arg("depends") = py::list());
 
     // m.def("_copy_numpy_ndarray_into_usm_ndarray",
     //       &copy_numpy_ndarray_into_usm_ndarray,
@@ -322,32 +320,32 @@ PYBIND11_MODULE(_tensor_impl, m)
     //       synchronously.", py::arg("src"), py::arg("dst"),
     //       py::arg("sycl_queue"), py::arg("depends") = py::list());
 
-    // m.def("_zeros_usm_ndarray", &usm_ndarray_zeros,
-    //       "Populate usm_ndarray `dst` with zeros.", py::arg("dst"),
-    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+    m.def("_zeros_usm_ndarray", &usm_ndarray_zeros,
+          "Populate usm_ndarray `dst` with zeros.", py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
 
-    // m.def("_full_usm_ndarray", &usm_ndarray_full,
-    //       "Populate usm_ndarray `dst` with given fill_value.",
-    //       py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"),
-    //       py::arg("depends") = py::list());
+    m.def("_full_usm_ndarray", &usm_ndarray_full,
+          "Populate usm_ndarray `dst` with given fill_value.",
+          py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 
-    // m.def("_take", &usm_ndarray_take,
-    //       "Takes elements at usm_ndarray indices `ind` and axes starting "
-    //       "at axis `axis_start` from array `src` and copies them "
-    //       "into usm_ndarray `dst` synchronously."
-    //       "Returns a tuple of events: (hev, ev)",
-    //       py::arg("src"), py::arg("ind"), py::arg("dst"),
-    //       py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"),
-    //   py::arg("depends") = py::list());
-
-    // m.def("_put", &usm_ndarray_put,
-    //       "Puts elements at usm_ndarray indices `ind` and axes starting "
-    //       "at axis `axis_start` into array `dst` from "
-    //       "usm_ndarray `val` synchronously."
-    //       "Returns a tuple of events: (hev, ev)",
-    //       py::arg("dst"), py::arg("ind"), py::arg("val"),
-    //       py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"),
-    //       py::arg("depends") = py::list());
+    m.def("_take", &usm_ndarray_take,
+          "Takes elements at usm_ndarray indices `ind` and axes starting "
+          "at axis `axis_start` from array `src` and copies them "
+          "into usm_ndarray `dst` synchronously."
+          "Returns a tuple of events: (hev, ev)",
+          py::arg("src"), py::arg("ind"), py::arg("dst"), py::arg("axis_start"),
+          py::arg("mode"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_put", &usm_ndarray_put,
+          "Puts elements at usm_ndarray indices `ind` and axes starting "
+          "at axis `axis_start` into array `dst` from "
+          "usm_ndarray `val` synchronously."
+          "Returns a tuple of events: (hev, ev)",
+          py::arg("dst"), py::arg("ind"), py::arg("val"), py::arg("axis_start"),
+          py::arg("mode"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 
     // m.def("_eye", &usm_ndarray_eye,
     //       "Fills input 2D contiguous usm_ndarray `dst` with "
@@ -387,27 +385,27 @@ PYBIND11_MODULE(_tensor_impl, m)
           dpctl::tensor::py_internal::default_device_index_type,
           "Gives default index type supported by device.", py::arg("dev"));
 
-    // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src,
-    //                   const dpctl::tensor::usm_ndarray &dst, py::ssize_t k,
-    //                   sycl::queue &exec_q,
-    //                   const std::vector<sycl::event> depends)
-    //     -> std::pair<sycl::event, sycl::event> {
-    //     return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends);
-    // };
-    // m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"),
-    //       py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
-    //       py::arg("depends") = py::list());
+    auto tril_fn = [](const dpctl::tensor::usm_ndarray &src,
+                      const dpctl::tensor::usm_ndarray &dst, py::ssize_t k,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends);
+    };
+    m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"),
+          py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 
-    // auto triu_fn = [](const dpctl::tensor::usm_ndarray &src,
-    //                   const dpctl::tensor::usm_ndarray &dst, py::ssize_t k,
-    //                   sycl::queue &exec_q,
-    //                   const std::vector<sycl::event> depends)
-    //     -> std::pair<sycl::event, sycl::event> {
-    //     return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends);
-    // };
-    // m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"),
-    //       py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
-    //       py::arg("depends") = py::list());
+    auto triu_fn = [](const dpctl::tensor::usm_ndarray &src,
+                      const dpctl::tensor::usm_ndarray &dst, py::ssize_t k,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends);
+    };
+    m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"),
+          py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 
     // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"),
     //       py::arg("cumsum"), py::arg("sycl_queue"),
diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp
new file mode 100644
index 000000000000..13e909196460
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp
@@ -0,0 +1,246 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <algorithm> // for std::copy
+#include <cstddef>   // for std::size_t
+#include <iterator>  // for std::begin, std::end
+#include <memory>    // for std::make_shared
+#include <utility>   // for std::pair, std::move
+#include <vector>    // for std::vector, std::begin, std::end
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "kernels/constructors.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+using dpctl::utils::keep_args_alive;
+
+using dpctl::tensor::kernels::constructors::tri_fn_ptr_t;
+
+static tri_fn_ptr_t tril_generic_dispatch_vector[td_ns::num_types];
+static tri_fn_ptr_t triu_generic_dispatch_vector[td_ns::num_types];
+
+std::pair<sycl::event, sycl::event>
+    usm_ndarray_triul(sycl::queue &exec_q,
+                      const dpctl::tensor::usm_ndarray &src,
+                      const dpctl::tensor::usm_ndarray &dst,
+                      char part,
+                      py::ssize_t k = 0,
+                      const std::vector<sycl::event> &depends = {})
+{
+    // array dimensions must be the same
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    if (src_nd < 2) {
+        throw py::value_error("Array dimensions less than 2.");
+    }
+
+    // shapes must be the same
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+
+    for (int i = 0; shapes_equal && i < src_nd; ++i) {
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    // check that arrays do not overlap, and concurrent copying is safe.
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        // TODO: could use a temporary, but this is done by the caller
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (dst_typeid != src_typeid) {
+        throw py::value_error("Array dtype are not the same.");
+    }
+
+    // check same queues
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue context is not the same as allocation contexts");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    auto src_strides = src.get_strides_vector();
+    auto dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_nd - 2;
+    const py::ssize_t *shape = src_shape;
+
+    const shT iter_src_strides(std::begin(src_strides),
+                               std::begin(src_strides) + nd);
+    const shT iter_dst_strides(std::begin(dst_strides),
+                               std::begin(dst_strides) + nd);
+
+    simplify_iteration_space(nd, shape, iter_src_strides, iter_dst_strides,
+                             // output
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
+
+    if (src_offset != 0 || dst_offset != 0) {
+        throw py::value_error("Reversed slice for dst is not supported");
+    }
+
+    nd += 2;
+
+    using usm_host_allocatorT =
+        dpctl::tensor::alloc_utils::usm_host_allocator<py::ssize_t>;
+    using usmshT = std::vector<py::ssize_t, usm_host_allocatorT>;
+
+    usm_host_allocatorT allocator(exec_q);
+    auto shp_host_shape_and_strides =
+        std::make_shared<usmshT>(3 * nd, allocator);
+
+    std::copy(simplified_shape.begin(), simplified_shape.end(),
+              shp_host_shape_and_strides->begin());
+    (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2];
+    (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1];
+
+    std::copy(simplified_src_strides.begin(), simplified_src_strides.end(),
+              shp_host_shape_and_strides->begin() + nd);
+    (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2];
+    (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1];
+
+    std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(),
+              shp_host_shape_and_strides->begin() + 2 * nd);
+    (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2];
+    (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1];
+
+    auto dev_shape_and_strides_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(3 * nd,
+                                                                     exec_q);
+    py::ssize_t *dev_shape_and_strides = dev_shape_and_strides_owner.get();
+
+    const sycl::event &copy_shape_and_strides = exec_q.copy<py::ssize_t>(
+        shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd);
+
+    py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2];
+    py::ssize_t outer_range = src_nelems / inner_range;
+
+    sycl::event tri_ev;
+    if (part == 'l') {
+        auto fn = tril_generic_dispatch_vector[src_typeid];
+        tri_ev =
+            fn(exec_q, inner_range, outer_range, src_data, dst_data, nd,
+               dev_shape_and_strides, k, depends, {copy_shape_and_strides});
+    }
+    else {
+        auto fn = triu_generic_dispatch_vector[src_typeid];
+        tri_ev =
+            fn(exec_q, inner_range, outer_range, src_data, dst_data, nd,
+               dev_shape_and_strides, k, depends, {copy_shape_and_strides});
+    }
+
+    const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(tri_ev);
+        const auto &ctx = exec_q.get_context();
+        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
+        cgh.host_task(
+            [shp_host_shape_and_strides = std::move(shp_host_shape_and_strides),
+             dev_shape_and_strides, ctx]() {
+                // capture of shp_host_shape_and_strides ensure the underlying
+                // vector exists for the entire execution of copying kernel
+                sycl_free_noexcept(dev_shape_and_strides, ctx);
+            });
+    });
+    // since host_task now owns USM allocation, release ownership by smart
+    // pointer
+    dev_shape_and_strides_owner.release();
+
+    return std::make_pair(
+        keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}), tri_ev);
+}
+
+void init_triul_ctor_dispatch_vectors(void)
+{
+
+    using namespace td_ns;
+    using dpctl::tensor::kernels::constructors::TrilGenericFactory;
+    using dpctl::tensor::kernels::constructors::TriuGenericFactory;
+
+    DispatchVectorBuilder<tri_fn_ptr_t, TrilGenericFactory, num_types> dvb1;
+    dvb1.populate_dispatch_vector(tril_generic_dispatch_vector);
+
+    DispatchVectorBuilder<tri_fn_ptr_t, TriuGenericFactory, num_types> dvb2;
+    dvb2.populate_dispatch_vector(triu_generic_dispatch_vector);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp
new file mode 100644
index 000000000000..47cc4ce8892d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp
@@ -0,0 +1,58 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <sycl/sycl.hpp>
+#include <utility>
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    usm_ndarray_triul(sycl::queue &exec_q,
+                      const dpctl::tensor::usm_ndarray &src,
+                      const dpctl::tensor::usm_ndarray &dst,
+                      char part,
+                      py::ssize_t k = 0,
+                      const std::vector<sycl::event> &depends = {});
+
+extern void init_triul_ctor_dispatch_vectors(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp
new file mode 100644
index 000000000000..2eb05e49f382
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp
@@ -0,0 +1,161 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstddef>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/complex.h>
+#include <pybind11/pybind11.h>
+
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "zeros_ctor.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+using dpctl::utils::keep_args_alive;
+
+typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &,
+                                             std::size_t,
+                                             char *,
+                                             const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to submit kernel to fill given contiguous memory allocation
+ * with zeros.
+ *
+ * @param exec_q  Sycl queue to which kernel is submitted for execution.
+ * @param nelems  Length of the sequence
+ * @param dst_p Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename dstTy>
+sycl::event zeros_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              char *dst_p,
+                              const std::vector<sycl::event> &depends)
+{
+
+    static constexpr int memset_val(0);
+    sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
+                   nelems * sizeof(dstTy));
+    });
+
+    return fill_ev;
+}
+
+template <typename fnT, typename Ty>
+struct ZerosContigFactory
+{
+    fnT get()
+    {
+        fnT f = zeros_contig_impl<Ty>;
+        return f;
+    }
+};
+
+static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types];
+
+std::pair<sycl::event, sycl::event>
+    usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> &depends)
+{
+    py::ssize_t dst_nelems = dst.get_size();
+
+    if (dst_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with the allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int dst_typenum = dst.get_typenum();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    char *dst_data = dst.get_data();
+
+    if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) {
+        auto fn = zeros_contig_dispatch_vector[dst_typeid];
+
+        sycl::event zeros_contig_event =
+            fn(exec_q, static_cast<std::size_t>(dst_nelems), dst_data, depends);
+
+        return std::make_pair(
+            keep_args_alive(exec_q, {dst}, {zeros_contig_event}),
+            zeros_contig_event);
+    }
+    else {
+        throw std::runtime_error(
+            "Only population of contiguous usm_ndarray objects is supported.");
+    }
+}
+
+void init_zeros_ctor_dispatch_vectors(void)
+{
+    using namespace td_ns;
+
+    DispatchVectorBuilder<zeros_contig_fn_ptr_t, ZerosContigFactory, num_types>
+        dvb;
+    dvb.populate_dispatch_vector(zeros_contig_dispatch_vector);
+
+    return;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp
new file mode 100644
index 000000000000..51a1903a0f36
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp
@@ -0,0 +1,54 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> &depends = {});
+
+extern void init_zeros_ctor_dispatch_vectors(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py
index 112ea3af0fdb..4137a2794747 100644
--- a/dpnp/dpnp_algo/dpnp_fill.py
+++ b/dpnp/dpnp_algo/dpnp_fill.py
@@ -31,14 +31,17 @@
 import dpctl.tensor as dpt
 import dpctl.utils as dpu
 from dpctl.tensor._ctors import _cast_fill_val
-from dpctl.tensor._tensor_impl import (
+
+import dpnp
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._tensor_impl import (
     _copy_usm_ndarray_into_usm_ndarray,
     _full_usm_ndarray,
     _zeros_usm_ndarray,
 )
 
-import dpnp
-
 
 def dpnp_fill(arr, val):
     arr = dpnp.get_usm_ndarray(arr)
diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py
index 4975db17c717..c8e28529cd57 100644
--- a/dpnp/dpnp_container.py
+++ b/dpnp/dpnp_container.py
@@ -38,6 +38,7 @@
 import dpctl.tensor as dpt
 import dpctl.utils as dpu
 
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 from dpnp.dpnp_array import dpnp_array
 
@@ -228,7 +229,7 @@ def full(
         fill_value = fill_value.get_array()
 
     """Creates `dpnp_array` having a specified shape, filled with fill_value."""
-    array_obj = dpt.full(
+    array_obj = dpt_ext.full(
         shape,
         fill_value,
         dtype=dtype,
@@ -269,13 +270,13 @@ def ones(
 
 def tril(x1, /, *, k=0):
     """Creates `dpnp_array` as lower triangular part of an input array."""
-    array_obj = dpt.tril(dpnp.get_usm_ndarray(x1), k=k)
+    array_obj = dpt_ext.tril(dpnp.get_usm_ndarray(x1), k=k)
     return dpnp_array._create_from_usm_ndarray(array_obj)
 
 
 def triu(x1, /, *, k=0):
     """Creates `dpnp_array` as upper triangular part of an input array."""
-    array_obj = dpt.triu(dpnp.get_usm_ndarray(x1), k=k)
+    array_obj = dpt_ext.triu(dpnp.get_usm_ndarray(x1), k=k)
     return dpnp_array._create_from_usm_ndarray(array_obj)
 
 
diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index 50b474014666..533bdc36c617 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -50,6 +50,7 @@
 import numpy
 from dpctl.tensor._device import normalize_queue_device
 
+# pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor._tensor_impl as ti
diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index 7718412701e8..583561573b85 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -45,13 +45,18 @@
 from collections.abc import Iterable
 
 import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dpu
 import numpy
 from dpctl.tensor._copy_utils import _nonzero_impl
 from dpctl.tensor._indexing_functions import _get_indexing_mode
 from dpctl.tensor._numpy_helper import normalize_axis_index
 
+import dpctl_ext.tensor as dpt_ext
+
+# pylint: disable=no-name-in-module
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 
 # pylint: disable=no-name-in-module
@@ -813,7 +818,7 @@ def extract(condition, a):
         usm_a = dpt.reshape(usm_a, -1)
         usm_cond = dpt.reshape(usm_cond, -1)
 
-        usm_res = dpt.take(usm_a, dpt.nonzero(usm_cond)[0])
+        usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0])
     else:
         if usm_cond.shape != usm_a.shape:
             usm_a = dpt.reshape(usm_a, -1)
@@ -1713,7 +1718,7 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"):
     if axis is None and usm_a.ndim > 1:
         usm_a = dpt.reshape(usm_a, -1)
 
-    dpt.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode)
+    dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode)
     if in_usm_a._pointer != usm_a._pointer:  # pylint: disable=protected-access
         in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False)
 
diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py
index 709494e6255e..534b9404254f 100644
--- a/dpnp/fft/dpnp_utils_fft.py
+++ b/dpnp/fft/dpnp_utils_fft.py
@@ -42,6 +42,10 @@
 from collections.abc import Sequence
 
 import dpctl
+
+# pylint: disable=no-name-in-module
+# TODO: remove it when ti.__linspace_step
+# is migrated to dpctl_ext/tensor
 import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dpu
 import numpy
@@ -51,6 +55,10 @@
 )
 from dpctl.utils import ExecutionPlacementError
 
+# pylint: disable=no-name-in-module
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._tensor_impl as ti_ext
 import dpnp
 import dpnp.backend.extensions.fft._fft_impl as fi
 
@@ -196,8 +204,8 @@ def _compute_result(dsc, a, out, forward, c2c, out_strides):
         out_usm = None if out is None else dpnp.get_usm_ndarray(out)
         if (
             out is not None
-            and out_usm.strides == tuple(out_strides)
-            and not ti._array_overlap(a_usm, out_usm)
+            and out.strides == tuple(out_strides)
+            and not ti_ext._array_overlap(a_usm, dpnp.get_usm_ndarray(out))
         ):
             res_usm = out_usm
             result = out
@@ -530,7 +538,7 @@ def _truncate_or_pad(a, shape, axes):
             )
             _manager = dpu.SequentialOrderManager[exec_q]
             dep_evs = _manager.submitted_events
-            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            ht_copy_ev, copy_ev = ti_ext._copy_usm_ndarray_into_usm_ndarray(
                 src=dpnp.get_usm_ndarray(a),
                 dst=z.get_array()[tuple(index)],
                 sycl_queue=exec_q,
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 6881c7787e9f..c6897e7b0614 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -42,12 +42,15 @@
 
 from typing import NamedTuple
 
-import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dpu
 import numpy
 from dpctl.tensor._numpy_helper import normalize_axis_index
 from numpy import prod
 
+# pylint: disable=no-name-in-module
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 import dpnp.backend.extensions.lapack._lapack_impl as li
 from dpnp.dpnp_utils import get_usm_allocations
diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py
index 9e5ae405ccc5..f7bdd5330d42 100644
--- a/dpnp/scipy/linalg/_utils.py
+++ b/dpnp/scipy/linalg/_utils.py
@@ -45,6 +45,7 @@
 
 import dpctl.utils as dpu
 
+# pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor._tensor_impl as ti

From 195b89330a174c3d614db7b6207469413ae6bc55 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 3 Mar 2026 11:25:26 +0100
Subject: [PATCH 04/43] Extend `._tensor_impl` with copy functions  (#2774)

This PR extends `_tensor_impl` in `dpctl_ext.tensor` with the copy
functions (`_copy_usm_ndarray_for_reshape` ,
`_copy_numpy_ndarray_into_usm_ndarray`. `_copy_usm_ndarray_for_roll_1d`,
`_copy_usm_ndarray_for_roll_nd`)

It also adds `asnumpy(), astype(), copy(), from_numpy(), to_numpy(),
roll(), and reshape()` to `dpctl_ext.tensor` and updates the
corresponding dpnp functions to use these implementations internally
---
 dpctl_ext/tensor/CMakeLists.txt               |   6 +-
 dpctl_ext/tensor/__init__.py                  |  18 +
 dpctl_ext/tensor/_copy_utils.py               | 755 ++++++++++++++++++
 dpctl_ext/tensor/_ctors.py                    |   5 +-
 dpctl_ext/tensor/_indexing_functions.py       |   5 +-
 dpctl_ext/tensor/_manipulation_functions.py   | 120 +++
 dpctl_ext/tensor/_reshape.py                  | 209 +++++
 .../libtensor/source/copy_for_reshape.cpp     | 184 +++++
 .../libtensor/source/copy_for_reshape.hpp     |  54 ++
 .../tensor/libtensor/source/copy_for_roll.cpp | 400 ++++++++++
 .../tensor/libtensor/source/copy_for_roll.hpp |  65 ++
 .../copy_numpy_ndarray_into_usm_ndarray.cpp   | 368 +++++++++
 .../copy_numpy_ndarray_into_usm_ndarray.hpp   |  57 ++
 .../tensor/libtensor/source/tensor_ctors.cpp  |  71 +-
 dpnp/dpnp_algo/dpnp_arraycreation.py          |   9 +-
 dpnp/dpnp_algo/dpnp_elementwise_common.py     |  13 +-
 dpnp/dpnp_algo/dpnp_fill.py                   |   6 +-
 dpnp/dpnp_array.py                            |   5 +-
 dpnp/dpnp_container.py                        |   4 +-
 dpnp/dpnp_iface.py                            |   3 +-
 dpnp/dpnp_iface_arraycreation.py              |  15 +-
 dpnp/dpnp_iface_indexing.py                   |  39 +-
 dpnp/dpnp_iface_manipulation.py               |  11 +-
 dpnp/dpnp_iface_sorting.py                    |   5 +-
 dpnp/dpnp_iface_statistics.py                 |   5 +-
 dpnp/tests/test_arraycreation.py              |   5 +-
 26 files changed, 2351 insertions(+), 86 deletions(-)
 create mode 100644 dpctl_ext/tensor/_copy_utils.py
 create mode 100644 dpctl_ext/tensor/_manipulation_functions.py
 create mode 100644 dpctl_ext/tensor/_reshape.py
 create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index fd781a9f9586..93555981deaa 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -48,9 +48,9 @@ set(_tensor_impl_sources
     # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp
     # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp
     # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index 3c6939eff7a0..edb2c096bad1 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -27,6 +27,13 @@
 # *****************************************************************************
 
 
+from dpctl_ext.tensor._copy_utils import (
+    asnumpy,
+    astype,
+    copy,
+    from_numpy,
+    to_numpy,
+)
 from dpctl_ext.tensor._ctors import (
     full,
     tril,
@@ -36,11 +43,22 @@
     put,
     take,
 )
+from dpctl_ext.tensor._manipulation_functions import (
+    roll,
+)
+from dpctl_ext.tensor._reshape import reshape
 
 __all__ = [
+    "asnumpy",
+    "astype",
+    "copy",
+    "from_numpy",
     "full",
     "put",
+    "reshape",
+    "roll",
     "take",
+    "to_numpy",
     "tril",
     "triu",
 ]
diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py
new file mode 100644
index 000000000000..c62218893a2c
--- /dev/null
+++ b/dpctl_ext/tensor/_copy_utils.py
@@ -0,0 +1,755 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import builtins
+
+import dpctl
+import dpctl.memory as dpm
+import dpctl.tensor as dpt
+import dpctl.utils
+import numpy as np
+from dpctl.tensor._data_types import _get_dtype
+from dpctl.tensor._device import normalize_queue_device
+from dpctl.tensor._type_utils import _dtype_supported_by_device_impl
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._tensor_impl as ti
+
+__doc__ = (
+    "Implementation module for copy- and cast- operations on "
+    ":class:`dpctl.tensor.usm_ndarray`."
+)
+
+int32_t_max = 1 + np.iinfo(np.int32).max
+
+
+def _copy_to_numpy(ary):
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}")
+    if ary.size == 0:
+        # no data needs to be copied for zero sized array
+        return np.ndarray(ary.shape, dtype=ary.dtype)
+    nb = ary.usm_data.nbytes
+    q = ary.sycl_queue
+    hh = dpm.MemoryUSMHost(nb, queue=q)
+    h = np.ndarray(nb, dtype="u1", buffer=hh).view(ary.dtype)
+    itsz = ary.itemsize
+    strides_bytes = tuple(si * itsz for si in ary.strides)
+    offset = ary._element_offset * itsz
+    # ensure that content of ary.usm_data is final
+    q.wait()
+    hh.copy_from_device(ary.usm_data)
+    return np.ndarray(
+        ary.shape,
+        dtype=ary.dtype,
+        buffer=h,
+        strides=strides_bytes,
+        offset=offset,
+    )
+
+
+def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None):
+    """Copies numpy array `np_ary` into a new usm_ndarray"""
+    # This may perform a copy to meet stated requirements
+    Xnp = np.require(np_ary, requirements=["A", "E"])
+    alloc_q = normalize_queue_device(sycl_queue=sycl_queue, device=None)
+    dt = Xnp.dtype
+    if dt.char in "dD" and alloc_q.sycl_device.has_aspect_fp64 is False:
+        Xusm_dtype = (
+            dpt.dtype("float32") if dt.char == "d" else dpt.dtype("complex64")
+        )
+    else:
+        Xusm_dtype = dt
+    Xusm = dpt.empty(
+        Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue
+    )
+    _copy_from_numpy_into(Xusm, Xnp)
+    return Xusm
+
+
+def _copy_from_numpy_into(dst, np_ary):
+    """Copies `np_ary` into `dst` of type :class:`dpctl.tensor.usm_ndarray"""
+    if not isinstance(np_ary, np.ndarray):
+        raise TypeError(f"Expected numpy.ndarray, got {type(np_ary)}")
+    if not isinstance(dst, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray, got {type(dst)}")
+    if np_ary.flags["OWNDATA"]:
+        Xnp = np_ary
+    else:
+        # Determine base of input array
+        base = np_ary.base
+        while isinstance(base, np.ndarray):
+            base = base.base
+        if isinstance(base, dpm._memory._Memory):
+            # we must perform a copy, since subsequent
+            # _copy_numpy_ndarray_into_usm_ndarray is implemented using
+            # sycl::buffer, and using USM-pointers with sycl::buffer
+            # results is undefined behavior
+            Xnp = np_ary.copy()
+        else:
+            Xnp = np_ary
+    src_ary = np.broadcast_to(Xnp, dst.shape)
+    copy_q = dst.sycl_queue
+    if copy_q.sycl_device.has_aspect_fp64 is False:
+        src_ary_dt_c = src_ary.dtype.char
+        if src_ary_dt_c == "d":
+            src_ary = src_ary.astype(np.float32)
+        elif src_ary_dt_c == "D":
+            src_ary = src_ary.astype(np.complex64)
+    _manager = dpctl.utils.SequentialOrderManager[copy_q]
+    dep_ev = _manager.submitted_events
+    # synchronizing call
+    ti._copy_numpy_ndarray_into_usm_ndarray(
+        src=src_ary, dst=dst, sycl_queue=copy_q, depends=dep_ev
+    )
+
+
+def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None):
+    """
+    from_numpy(arg, device=None, usm_type="device", sycl_queue=None)
+
+    Creates :class:`dpctl.tensor.usm_ndarray` from instance of
+    :class:`numpy.ndarray`.
+
+    Args:
+        arg:
+            Input convertible to :class:`numpy.ndarray`
+        device (object): array API specification of device where the
+            output array is created. Device can be specified by
+            a filter selector string, an instance of
+            :class:`dpctl.SyclDevice`, an instance of
+            :class:`dpctl.SyclQueue`, or an instance of
+            :class:`dpctl.tensor.Device`. If the value is ``None``,
+            returned array is created on the default-selected device.
+            Default: ``None``
+        usm_type (str): The requested USM allocation type for the
+            output array. Recognized values are ``"device"``,
+            ``"shared"``, or ``"host"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            A SYCL queue that determines output array allocation device
+            as well as execution placement of data movement operations.
+            The ``device`` and ``sycl_queue`` arguments
+            are equivalent. Only one of them should be specified. If both
+            are provided, they must be consistent and result in using the
+            same execution queue. Default: ``None``
+
+    The returned array has the same shape, and the same data type kind.
+    If the device does not support the data type of input array, a
+    closest support data type of the same kind may be returned, e.g.
+    input array of type ``float16`` may be upcast to ``float32`` if the
+    target device does not support 16-bit floating point type.
+    """
+    q = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    return _copy_from_numpy(np_ary, usm_type=usm_type, sycl_queue=q)
+
+
+def to_numpy(usm_ary, /):
+    """
+    to_numpy(usm_ary)
+
+    Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary``
+    into :class:`numpy.ndarray` instance of the same shape and same data type.
+
+    Args:
+        usm_ary (usm_ndarray):
+            Input array
+    Returns:
+        :class:`numpy.ndarray`:
+            An instance of :class:`numpy.ndarray` populated with content of
+            ``usm_ary``
+    """
+    return _copy_to_numpy(usm_ary)
+
+
+def asnumpy(usm_ary):
+    """
+    asnumpy(usm_ary)
+
+    Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary``
+    into :class:`numpy.ndarray` instance of the same shape and same data
+    type.
+
+    Args:
+        usm_ary (usm_ndarray):
+            Input array
+    Returns:
+        :class:`numpy.ndarray`:
+            An instance of :class:`numpy.ndarray` populated with content
+            of ``usm_ary``
+    """
+    return _copy_to_numpy(usm_ary)
+
+
+class Dummy:
+    """Helper class with specified ``__sycl_usm_array_interface__`` attribute"""
+
+    def __init__(self, iface):
+        self.__sycl_usm_array_interface__ = iface
+
+
+def _copy_overlapping(dst, src):
+    """Assumes src and dst have the same shape."""
+    q = normalize_queue_device(sycl_queue=dst.sycl_queue)
+    tmp = dpt.usm_ndarray(
+        src.shape,
+        dtype=src.dtype,
+        buffer="device",
+        order="C",
+        buffer_ctor_kwargs={"queue": q},
+    )
+    _manager = dpctl.utils.SequentialOrderManager[q]
+    dep_evs = _manager.submitted_events
+    hcp1, cp1 = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=src, dst=tmp, sycl_queue=q, depends=dep_evs
+    )
+    _manager.add_event_pair(hcp1, cp1)
+    hcp2, cp2 = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=tmp, dst=dst, sycl_queue=q, depends=[cp1]
+    )
+    _manager.add_event_pair(hcp2, cp2)
+
+
+def _copy_same_shape(dst, src):
+    """Assumes src and dst have the same shape."""
+    # check that memory regions do not overlap
+    if ti._array_overlap(dst, src):
+        if src._pointer == dst._pointer and (
+            src is dst
+            or (src.strides == dst.strides and src.dtype == dst.dtype)
+        ):
+            return
+        _copy_overlapping(src=src, dst=dst)
+        return
+
+    copy_q = dst.sycl_queue
+    _manager = dpctl.utils.SequentialOrderManager[copy_q]
+    dep_evs = _manager.submitted_events
+    hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=src, dst=dst, sycl_queue=copy_q, depends=dep_evs
+    )
+    _manager.add_event_pair(hev, cpy_ev)
+
+
+if hasattr(np, "broadcast_shapes"):
+
+    def _broadcast_shapes(sh1, sh2):
+        return np.broadcast_shapes(sh1, sh2)
+
+else:
+
+    def _broadcast_shapes(sh1, sh2):
+        # use arrays with zero strides, whose memory footprint
+        # is independent of the number of array elements
+        return np.broadcast(
+            np.empty(sh1, dtype=[]),
+            np.empty(sh2, dtype=[]),
+        ).shape
+
+
+def _broadcast_strides(X_shape, X_strides, res_ndim):
+    """
+    Broadcasts strides to match the given dimensions;
+    returns tuple type strides.
+    """
+    out_strides = [0] * res_ndim
+    X_shape_len = len(X_shape)
+    str_dim = -X_shape_len
+    for i in range(X_shape_len):
+        shape_value = X_shape[i]
+        if not shape_value == 1:
+            out_strides[str_dim] = X_strides[i]
+        str_dim += 1
+
+    return tuple(out_strides)
+
+
+def _copy_from_usm_ndarray_to_usm_ndarray(dst, src):
+    if any(
+        not isinstance(arg, dpt.usm_ndarray)
+        for arg in (
+            dst,
+            src,
+        )
+    ):
+        raise TypeError(
+            "Both types are expected to be dpctl.tensor.usm_ndarray, "
+            f"got {type(dst)} and {type(src)}."
+        )
+
+    if dst.ndim == src.ndim and dst.shape == src.shape:
+        _copy_same_shape(dst, src)
+        return
+
+    try:
+        common_shape = _broadcast_shapes(dst.shape, src.shape)
+    except ValueError as exc:
+        raise ValueError("Shapes of two arrays are not compatible") from exc
+
+    if dst.size < src.size and dst.size < np.prod(common_shape):
+        raise ValueError("Destination is smaller ")
+
+    if len(common_shape) > dst.ndim:
+        ones_count = len(common_shape) - dst.ndim
+        for k in range(ones_count):
+            if common_shape[k] != 1:
+                raise ValueError
+        common_shape = common_shape[ones_count:]
+
+    if src.ndim < len(common_shape):
+        new_src_strides = _broadcast_strides(
+            src.shape, src.strides, len(common_shape)
+        )
+        src_same_shape = dpt.usm_ndarray(
+            common_shape,
+            dtype=src.dtype,
+            buffer=src,
+            strides=new_src_strides,
+            offset=src._element_offset,
+        )
+    elif src.ndim == len(common_shape):
+        new_src_strides = _broadcast_strides(
+            src.shape, src.strides, len(common_shape)
+        )
+        src_same_shape = dpt.usm_ndarray(
+            common_shape,
+            dtype=src.dtype,
+            buffer=src,
+            strides=new_src_strides,
+            offset=src._element_offset,
+        )
+    else:
+        # since broadcasting succeeded, src.ndim is greater because of
+        # leading sequence of ones, so we trim it
+        n = len(common_shape)
+        new_src_strides = _broadcast_strides(
+            src.shape[-n:], src.strides[-n:], n
+        )
+        src_same_shape = dpt.usm_ndarray(
+            common_shape,
+            dtype=src.dtype,
+            buffer=src.usm_data,
+            strides=new_src_strides,
+            offset=src._element_offset,
+        )
+
+    _copy_same_shape(dst, src_same_shape)
+
+
+def _make_empty_like_orderK(x, dt, usm_type, dev):
+    """
+    Returns empty array with shape and strides like `x`, with dtype `dt`,
+    USM type `usm_type`, on device `dev`.
+    """
+    st = list(x.strides)
+    perm = sorted(
+        range(x.ndim),
+        key=lambda d: builtins.abs(st[d]) if x.shape[d] > 1 else 0,
+        reverse=True,
+    )
+    inv_perm = sorted(range(x.ndim), key=lambda i: perm[i])
+    sh = x.shape
+    sh_sorted = tuple(sh[i] for i in perm)
+    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
+    if min(st) < 0:
+        st_sorted = [st[i] for i in perm]
+        sl = tuple(
+            (
+                slice(None, None, -1)
+                if st_sorted[i] < 0
+                else slice(None, None, None)
+            )
+            for i in range(x.ndim)
+        )
+        R = R[sl]
+    return dpt.permute_dims(R, inv_perm)
+
+
+def _empty_like_orderK(x, dt, usm_type=None, dev=None):
+    """
+    Returns empty array like `x`, using order='K'
+
+    For an array `x` that was obtained by permutation of a contiguous
+    array the returned array will have the same shape and the same
+    strides as `x`.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray, got {type(x)}")
+    if usm_type is None:
+        usm_type = x.usm_type
+    if dev is None:
+        dev = x.device
+    fl = x.flags
+    if fl["C"] or x.size <= 1:
+        return dpt.empty_like(
+            x, dtype=dt, usm_type=usm_type, device=dev, order="C"
+        )
+    elif fl["F"]:
+        return dpt.empty_like(
+            x, dtype=dt, usm_type=usm_type, device=dev, order="F"
+        )
+    return _make_empty_like_orderK(x, dt, usm_type, dev)
+
+
+def _from_numpy_empty_like_orderK(x, dt, usm_type, dev):
+    """
+    Returns empty usm_ndarray like NumPy array `x`, using order='K'
+
+    For an array `x` that was obtained by permutation of a contiguous
+    array the returned array will have the same shape and the same
+    strides as `x`.
+    """
+    if not isinstance(x, np.ndarray):
+        raise TypeError(f"Expected numpy.ndarray, got {type(x)}")
+    fl = x.flags
+    if fl["C"] or x.size <= 1:
+        return dpt.empty(
+            x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
+        )
+    elif fl["F"]:
+        return dpt.empty(
+            x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
+        )
+    return _make_empty_like_orderK(x, dt, usm_type, dev)
+
+
+def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
+    if not isinstance(X1, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray, got {type(X1)}")
+    if not isinstance(X2, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray, got {type(X2)}")
+    nd1 = X1.ndim
+    nd2 = X2.ndim
+    if nd1 > nd2 and X1.shape == res_shape:
+        return _empty_like_orderK(X1, dt, usm_type, dev)
+    elif nd1 < nd2 and X2.shape == res_shape:
+        return _empty_like_orderK(X2, dt, usm_type, dev)
+    fl1 = X1.flags
+    fl2 = X2.flags
+    if fl1["C"] or fl2["C"]:
+        return dpt.empty(
+            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
+        )
+    if fl1["F"] and fl2["F"]:
+        return dpt.empty(
+            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
+        )
+    st1 = list(X1.strides)
+    st2 = list(X2.strides)
+    max_ndim = max(nd1, nd2)
+    st1 += [0] * (max_ndim - len(st1))
+    st2 += [0] * (max_ndim - len(st2))
+    sh1 = list(X1.shape) + [0] * (max_ndim - nd1)
+    sh2 = list(X2.shape) + [0] * (max_ndim - nd2)
+    perm = sorted(
+        range(max_ndim),
+        key=lambda d: (
+            builtins.abs(st1[d]) if sh1[d] > 1 else 0,
+            builtins.abs(st2[d]) if sh2[d] > 1 else 0,
+        ),
+        reverse=True,
+    )
+    inv_perm = sorted(range(max_ndim), key=lambda i: perm[i])
+    st1_sorted = [st1[i] for i in perm]
+    st2_sorted = [st2[i] for i in perm]
+    sh = res_shape
+    sh_sorted = tuple(sh[i] for i in perm)
+    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
+    if max(min(st1_sorted), min(st2_sorted)) < 0:
+        sl = tuple(
+            (
+                slice(None, None, -1)
+                if (st1_sorted[i] < 0 and st2_sorted[i] < 0)
+                else slice(None, None, None)
+            )
+            for i in range(nd1)
+        )
+        R = R[sl]
+    return dpt.permute_dims(R, inv_perm)
+
+
+def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
+    if not isinstance(X1, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray, got {type(X1)}")
+    if not isinstance(X2, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray, got {type(X2)}")
+    if not isinstance(X3, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray, got {type(X3)}")
+    nd1 = X1.ndim
+    nd2 = X2.ndim
+    nd3 = X3.ndim
+    if X1.shape == res_shape and X2.shape == res_shape and len(res_shape) > nd3:
+        return _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev)
+    elif (
+        X2.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd1
+    ):
+        return _empty_like_pair_orderK(X2, X3, dt, res_shape, usm_type, dev)
+    elif (
+        X1.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd2
+    ):
+        return _empty_like_pair_orderK(X1, X3, dt, res_shape, usm_type, dev)
+    fl1 = X1.flags
+    fl2 = X2.flags
+    fl3 = X3.flags
+    if fl1["C"] or fl2["C"] or fl3["C"]:
+        return dpt.empty(
+            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
+        )
+    if fl1["F"] and fl2["F"] and fl3["F"]:
+        return dpt.empty(
+            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
+        )
+    st1 = list(X1.strides)
+    st2 = list(X2.strides)
+    st3 = list(X3.strides)
+    max_ndim = max(nd1, nd2, nd3)
+    st1 += [0] * (max_ndim - len(st1))
+    st2 += [0] * (max_ndim - len(st2))
+    st3 += [0] * (max_ndim - len(st3))
+    sh1 = list(X1.shape) + [0] * (max_ndim - nd1)
+    sh2 = list(X2.shape) + [0] * (max_ndim - nd2)
+    sh3 = list(X3.shape) + [0] * (max_ndim - nd3)
+    perm = sorted(
+        range(max_ndim),
+        key=lambda d: (
+            builtins.abs(st1[d]) if sh1[d] > 1 else 0,
+            builtins.abs(st2[d]) if sh2[d] > 1 else 0,
+            builtins.abs(st3[d]) if sh3[d] > 1 else 0,
+        ),
+        reverse=True,
+    )
+    inv_perm = sorted(range(max_ndim), key=lambda i: perm[i])
+    st1_sorted = [st1[i] for i in perm]
+    st2_sorted = [st2[i] for i in perm]
+    st3_sorted = [st3[i] for i in perm]
+    sh = res_shape
+    sh_sorted = tuple(sh[i] for i in perm)
+    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
+    if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0:
+        sl = tuple(
+            (
+                slice(None, None, -1)
+                if (
+                    st1_sorted[i] < 0
+                    and st2_sorted[i] < 0
+                    and st3_sorted[i] < 0
+                )
+                else slice(None, None, None)
+            )
+            for i in range(nd1)
+        )
+        R = R[sl]
+    return dpt.permute_dims(R, inv_perm)
+
+
+def copy(usm_ary, /, *, order="K"):
+    """copy(ary, order="K")
+
+    Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`.
+
+    Args:
+        ary (usm_ndarray):
+            Input array
+        order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional):
+            Controls the memory layout of the output array
+    Returns:
+        usm_ndarray:
+            A copy of the input array.
+
+    Memory layout of the copy is controlled by ``order`` keyword,
+    following NumPy's conventions. The ``order`` keywords can be
+    one of the following:
+
+    .. list-table::
+
+        * - ``"C"``
+          - C-contiguous memory layout
+        * - ``"F"``
+          - Fortran-contiguous memory layout
+        * - ``"A"``
+          - Fortran-contiguous if the input array is also Fortran-contiguous,
+            otherwise C-contiguous
+        * - ``"K"``
+          - match the layout of ``usm_ary`` as closely as possible.
+
+    """
+    if len(order) == 0 or order[0] not in "KkAaCcFf":
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'."
+        )
+    order = order[0].upper()
+    if not isinstance(usm_ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}"
+        )
+    copy_order = "C"
+    if order == "C":
+        pass
+    elif order == "F":
+        copy_order = order
+    elif order == "A":
+        if usm_ary.flags.f_contiguous:
+            copy_order = "F"
+    elif order == "K":
+        if usm_ary.flags.f_contiguous:
+            copy_order = "F"
+    else:
+        raise ValueError(
+            "Unrecognized value of the order keyword. "
+            "Recognized values are 'A', 'C', 'F', or 'K'"
+        )
+    if order == "K":
+        R = _empty_like_orderK(usm_ary, usm_ary.dtype)
+    else:
+        R = dpt.usm_ndarray(
+            usm_ary.shape,
+            dtype=usm_ary.dtype,
+            buffer=usm_ary.usm_type,
+            order=copy_order,
+            buffer_ctor_kwargs={"queue": usm_ary.sycl_queue},
+        )
+    _copy_same_shape(R, usm_ary)
+    return R
+
+
+def astype(
+    usm_ary, newdtype, /, *, order="K", casting="unsafe", copy=True, device=None
+):
+    """astype(array, new_dtype, order="K", casting="unsafe", \
+            copy=True, device=None)
+
+    Returns a copy of the :class:`dpctl.tensor.usm_ndarray`, cast to a
+    specified type.
+
+    Args:
+        array (usm_ndarray):
+            An input array.
+        new_dtype (dtype):
+            The data type of the resulting array. If `None`, gives default
+            floating point type supported by device where the resulting array
+            will be located.
+        order ({"C", "F", "A", "K"}, optional):
+            Controls memory layout of the resulting array if a copy
+            is returned.
+        casting ({'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional):
+            Controls what kind of data casting may occur. Please see
+            :meth:`numpy.ndarray.astype` for description of casting modes.
+        copy (bool, optional):
+            By default, `astype` always returns a newly allocated array.
+            If this keyword is set to `False`, a view of the input array
+            may be returned when possible.
+        device (object): array API specification of device where the
+            output array is created. Device can be specified by
+            a filter selector string, an instance of
+            :class:`dpctl.SyclDevice`, an instance of
+            :class:`dpctl.SyclQueue`, or an instance of
+            :class:`dpctl.tensor.Device`. If the value is `None`,
+            returned array is created on the same device as `array`.
+            Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            An array with requested data type.
+
+    A view can be returned, if possible, when `copy=False` is used.
+    """
+    if not isinstance(usm_ary, dpt.usm_ndarray):
+        return TypeError(
+            f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}"
+        )
+    if len(order) == 0 or order[0] not in "KkAaCcFf":
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'."
+        )
+    order = order[0].upper()
+    ary_dtype = usm_ary.dtype
+    if device is not None:
+        if not isinstance(device, dpctl.SyclQueue):
+            if isinstance(device, dpt.Device):
+                device = device.sycl_queue
+            else:
+                device = dpt.Device.create_device(device).sycl_queue
+        d = device.sycl_device
+        target_dtype = _get_dtype(newdtype, device)
+        if not _dtype_supported_by_device_impl(
+            target_dtype, d.has_aspect_fp16, d.has_aspect_fp64
+        ):
+            raise ValueError(
+                f"Requested dtype '{target_dtype}' is not supported by the "
+                "target device"
+            )
+        usm_ary = usm_ary.to_device(device)
+    else:
+        target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue)
+
+    if not dpt.can_cast(ary_dtype, target_dtype, casting=casting):
+        raise TypeError(
+            f"Can not cast from {ary_dtype} to {newdtype} "
+            f"according to rule {casting}."
+        )
+    c_contig = usm_ary.flags.c_contiguous
+    f_contig = usm_ary.flags.f_contiguous
+    needs_copy = copy or not ary_dtype == target_dtype
+    if not needs_copy and (order != "K"):
+        # ensure that order="F" for C-contig input triggers copy,
+        # and order="C" for F-contig input triggers copy too.
+        # 1D arrays which are both C- and F- contig should not
+        # force copying for neither order="F", nor order="C", see gh-1926
+        needs_copy = (
+            c_contig and not f_contig and order not in ["A", "C"]
+        ) or (not c_contig and f_contig and order not in ["A", "F"])
+    if not needs_copy:
+        return usm_ary
+    copy_order = "C"
+    if order == "C":
+        pass
+    elif order == "F":
+        copy_order = order
+    elif order == "A":
+        if usm_ary.flags.f_contiguous:
+            copy_order = "F"
+    elif order == "K":
+        if usm_ary.flags.f_contiguous:
+            copy_order = "F"
+    else:
+        raise ValueError(
+            "Unrecognized value of the order keyword. "
+            "Recognized values are 'A', 'C', 'F', or 'K'"
+        )
+    if order == "K":
+        R = _empty_like_orderK(usm_ary, target_dtype)
+    else:
+        R = dpt.usm_ndarray(
+            usm_ary.shape,
+            dtype=target_dtype,
+            buffer=usm_ary.usm_type,
+            order=copy_order,
+            buffer_ctor_kwargs={"queue": usm_ary.sycl_queue},
+        )
+    _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary)
+    return R
diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py
index a0e7b28e66ff..5a39e9367e9c 100644
--- a/dpctl_ext/tensor/_ctors.py
+++ b/dpctl_ext/tensor/_ctors.py
@@ -36,6 +36,9 @@
 from dpctl.tensor._data_types import _get_dtype
 from dpctl.tensor._device import normalize_queue_device
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._tensor_impl as ti
 
 
@@ -147,7 +150,7 @@ def full(
             usm_type=usm_type,
             sycl_queue=sycl_queue,
         )
-        return dpt.copy(dpt.broadcast_to(X, shape), order=order)
+        return dpt_ext.copy(dpt.broadcast_to(X, shape), order=order)
     else:
         _validate_fill_value(fill_value)
 
diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py
index 106df09cf97e..df4f3e953042 100644
--- a/dpctl_ext/tensor/_indexing_functions.py
+++ b/dpctl_ext/tensor/_indexing_functions.py
@@ -32,6 +32,9 @@
 import dpctl.tensor as dpt
 import dpctl.utils
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._tensor_impl as ti
 
 from ._numpy_helper import normalize_axis_index
@@ -185,7 +188,7 @@ def put_vec_duplicates(vec, ind, vals):
     if vals.dtype == x.dtype:
         rhs = vals
     else:
-        rhs = dpt.astype(vals, x.dtype)
+        rhs = dpt_ext.astype(vals, x.dtype)
     rhs = dpt.broadcast_to(rhs, val_shape)
 
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py
new file mode 100644
index 000000000000..fa8fc27876b3
--- /dev/null
+++ b/dpctl_ext/tensor/_manipulation_functions.py
@@ -0,0 +1,120 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import operator
+
+import dpctl.tensor as dpt
+import dpctl.utils as dputils
+import numpy as np
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._tensor_impl as ti
+
+from ._numpy_helper import normalize_axis_tuple
+
+__doc__ = (
+    "Implementation module for array manipulation "
+    "functions in :module:`dpctl.tensor`"
+)
+
+
+def roll(x, /, shift, *, axis=None):
+    """
+    roll(x, shift, axis)
+
+    Rolls array elements along a specified axis.
+    Array elements that roll beyond the last position are re-introduced
+    at the first position. Array elements that roll beyond the first position
+    are re-introduced at the last position.
+
+    Args:
+        x (usm_ndarray): input array
+        shift (Union[int, Tuple[int,...]]): number of places by which the
+            elements are shifted. If `shift` is a tuple, then `axis` must be a
+            tuple of the same size, and each of the given axes must be shifted
+            by the corresponding element in `shift`. If `shift` is an `int`
+            and `axis` a tuple, then the same `shift` must be used for all
+            specified axes. If a `shift` is positive, then array elements is
+            shifted positively (toward larger indices) along the dimension of
+            `axis`.
+            If a `shift` is negative, then array elements must be shifted
+            negatively (toward smaller indices) along the dimension of `axis`.
+        axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along which
+            elements to shift. If `axis` is `None`, the array is
+            flattened, shifted, and then restored to its original shape.
+            Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            An array having the same `dtype`, `usm_type` and
+            `device` attributes as `x` and whose elements are shifted relative
+            to `x`.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(x)}.")
+    exec_q = x.sycl_queue
+    _manager = dputils.SequentialOrderManager[exec_q]
+    if axis is None:
+        shift = operator.index(shift)
+        res = dpt.empty(
+            x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
+        )
+        sz = operator.index(x.size)
+        shift = (shift % sz) if sz > 0 else 0
+        dep_evs = _manager.submitted_events
+        hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d(
+            src=x,
+            dst=res,
+            shift=shift,
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(hev, roll_ev)
+        return res
+    axis = normalize_axis_tuple(axis, x.ndim, allow_duplicate=True)
+    broadcasted = np.broadcast(shift, axis)
+    if broadcasted.ndim > 1:
+        raise ValueError("'shift' and 'axis' should be scalars or 1D sequences")
+    shifts = [
+        0,
+    ] * x.ndim
+    shape = x.shape
+    for sh, ax in broadcasted:
+        n_i = operator.index(shape[ax])
+        shifted = shifts[ax] + operator.index(sh)
+        shifts[ax] = (shifted % n_i) if n_i > 0 else 0
+    res = dpt.empty(
+        x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
+    )
+    dep_evs = _manager.submitted_events
+    ht_e, roll_ev = ti._copy_usm_ndarray_for_roll_nd(
+        src=x, dst=res, shifts=shifts, sycl_queue=exec_q, depends=dep_evs
+    )
+    _manager.add_event_pair(ht_e, roll_ev)
+    return res
diff --git a/dpctl_ext/tensor/_reshape.py b/dpctl_ext/tensor/_reshape.py
new file mode 100644
index 000000000000..61aa6c9c754f
--- /dev/null
+++ b/dpctl_ext/tensor/_reshape.py
@@ -0,0 +1,209 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import operator
+
+import dpctl.tensor as dpt
+import dpctl.utils
+import numpy as np
+
+# TODO: revert to `from dpctl.tensor._tensor_impl...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._tensor_impl import (
+    _copy_usm_ndarray_for_reshape,
+    _ravel_multi_index,
+    _unravel_index,
+)
+
+__doc__ = "Implementation module for :func:`dpctl.tensor.reshape`."
+
+
+def _make_unit_indexes(shape):
+    """
+    Construct a diagonal matrix with with one on the diagonal
+    except if the corresponding element of shape is 1.
+    """
+    nd = len(shape)
+    mi = np.zeros((nd, nd), dtype="u4")
+    for i, dim in enumerate(shape):
+        mi[i, i] = 1 if dim > 1 else 0
+    return mi
+
+
+def ti_unravel_index(flat_index, shape, order="C"):
+    return _unravel_index(flat_index, shape, order)
+
+
+def ti_ravel_multi_index(multi_index, shape, order="C"):
+    return _ravel_multi_index(multi_index, shape, order)
+
+
+def reshaped_strides(old_sh, old_sts, new_sh, order="C"):
+    """
+    When reshaping array with `old_sh` shape and `old_sts` strides
+    into the new shape `new_sh`, returns the new stride if the reshape
+    can be a view, otherwise returns `None`.
+    """
+    eye_new_mi = _make_unit_indexes(new_sh)
+    new_sts = [
+        sum(
+            st_i * ind_i
+            for st_i, ind_i in zip(
+                old_sts, ti_unravel_index(flat_index, old_sh, order=order)
+            )
+        )
+        for flat_index in [
+            ti_ravel_multi_index(unitvec, new_sh, order=order)
+            for unitvec in eye_new_mi
+        ]
+    ]
+    eye_old_mi = _make_unit_indexes(old_sh)
+    check_sts = [
+        sum(
+            st_i * ind_i
+            for st_i, ind_i in zip(
+                new_sts, ti_unravel_index(flat_index, new_sh, order=order)
+            )
+        )
+        for flat_index in [
+            ti_ravel_multi_index(unitvec, old_sh, order=order)
+            for unitvec in eye_old_mi
+        ]
+    ]
+    valid = all(
+        check_st == old_st or old_dim == 1
+        for check_st, old_st, old_dim in zip(check_sts, old_sts, old_sh)
+    )
+    return new_sts if valid else None
+
+
+def reshape(X, /, shape, *, order="C", copy=None):
+    """reshape(x, shape, order="C")
+
+    Reshapes array ``x`` into new shape.
+
+    Args:
+        x (usm_ndarray):
+            input array
+        shape (Tuple[int]):
+            the desired shape of the resulting array.
+        order ("C", "F", optional):
+            memory layout of the resulting array
+            if a copy is found to be necessary. Supported
+            choices are ``"C"`` for C-contiguous, or row-major layout;
+            and ``"F"`` for F-contiguous, or column-major layout.
+
+    Returns:
+        usm_ndarray:
+            Reshaped array is a view, if possible,
+            and a copy otherwise with memory layout as indicated
+            by ``order`` keyword.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError
+    if not isinstance(shape, (list, tuple)):
+        shape = (shape,)
+    if order in "cfCF":
+        order = order.upper()
+    else:
+        raise ValueError(
+            f"Keyword 'order' not recognized. Expecting 'C' or 'F', got {order}"
+        )
+    if copy not in (True, False, None):
+        raise ValueError(
+            f"Keyword 'copy' not recognized. Expecting True, False, "
+            f"or None, got {copy}"
+        )
+    shape = [operator.index(d) for d in shape]
+    negative_ones_count = 0
+    for nshi in shape:
+        if nshi == -1:
+            negative_ones_count = negative_ones_count + 1
+        if (nshi < -1) or negative_ones_count > 1:
+            raise ValueError(
+                "Target shape should have at most 1 negative "
+                "value which can only be -1"
+            )
+    if negative_ones_count:
+        sz = -np.prod(shape)
+        if sz == 0:
+            raise ValueError(
+                f"Can not reshape array of size {X.size} into "
+                f"shape {tuple(i for i in shape if i >= 0)}"
+            )
+        v = X.size // sz
+        shape = [v if d == -1 else d for d in shape]
+    if X.size != np.prod(shape):
+        raise ValueError(f"Can not reshape into {shape}")
+    if X.size:
+        newsts = reshaped_strides(X.shape, X.strides, shape, order=order)
+    else:
+        newsts = (1,) * len(shape)
+    copy_required = newsts is None
+    if copy_required and (copy is False):
+        raise ValueError(
+            "Reshaping the array requires a copy, but no copying was "
+            "requested by using copy=False"
+        )
+    copy_q = X.sycl_queue
+    if copy_required or (copy is True):
+        # must perform a copy
+        copy_q = X.sycl_queue
+        flat_res = dpt.usm_ndarray(
+            (X.size,),
+            dtype=X.dtype,
+            buffer=X.usm_type,
+            buffer_ctor_kwargs={"queue": copy_q},
+        )
+        _manager = dpctl.utils.SequentialOrderManager[copy_q]
+        dep_evs = _manager.submitted_events
+        if order == "C":
+            hev, r_e = _copy_usm_ndarray_for_reshape(
+                src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs
+            )
+        else:
+            X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1))
+            hev, r_e = _copy_usm_ndarray_for_reshape(
+                src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs
+            )
+        _manager.add_event_pair(hev, r_e)
+        return dpt.usm_ndarray(
+            tuple(shape), dtype=X.dtype, buffer=flat_res, order=order
+        )
+    # can form a view
+    if (len(shape) == X.ndim) and all(
+        s1 == s2 for s1, s2 in zip(shape, X.shape)
+    ):
+        return X
+    return dpt.usm_ndarray(
+        shape,
+        dtype=X.dtype,
+        buffer=X,
+        strides=tuple(newsts),
+        offset=X._element_offset,
+    )
diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp
new file mode 100644
index 000000000000..524bfcfdb98b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp
@@ -0,0 +1,184 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#include <iterator>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "copy_for_reshape.hpp"
+#include "kernels/copy_and_cast.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::copy_and_cast::copy_for_reshape_fn_ptr_t;
+using dpctl::utils::keep_args_alive;
+
+// define static vector
+static copy_for_reshape_fn_ptr_t
+    copy_for_reshape_generic_dispatch_vector[td_ns::num_types];
+
+/*
+ * Copies src into dst (same data type) of different shapes by using flat
+ * iterations.
+ *
+ * Equivalent to the following loop:
+ *
+ * for i for range(src.size):
+ *     dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)]
+ */
+std::pair<sycl::event, sycl::event>
+    copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src,
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends)
+{
+    py::ssize_t src_nelems = src.get_size();
+    py::ssize_t dst_nelems = dst.get_size();
+
+    // Must have the same number of elements
+    if (src_nelems != dst_nelems) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_reshape requires src and dst to "
+            "have the same number of elements.");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    // typenames must be the same
+    if (src_typenum != dst_typenum) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_reshape requires src and dst to "
+            "have the same type.");
+    }
+
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    // check same contexts
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if (src_nelems == 1) {
+        // handle special case of 1-element array
+        int src_elemsize = src.get_elemsize();
+        const char *src_data = src.get_data();
+        char *dst_data = dst.get_data();
+        sycl::event copy_ev =
+            exec_q.copy<char>(src_data, dst_data, src_elemsize, depends);
+        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
+                              copy_ev);
+    }
+
+    // dimensions may be different
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int type_id = array_types.typenum_to_lookup_id(src_typenum);
+
+    auto fn = copy_for_reshape_generic_dispatch_vector[type_id];
+
+    auto src_shape = src.get_shape_vector();
+    auto src_strides = src.get_strides_vector();
+
+    auto dst_shape = dst.get_shape_vector();
+    auto dst_strides = dst.get_strides_vector();
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    // shape_strides = [src_shape, src_strides, dst_shape, dst_strides]
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, src_shape, src_strides, dst_shape,
+        dst_strides);
+    auto copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    std::vector<sycl::event> all_deps(depends.size() + 1);
+    all_deps.push_back(copy_shape_ev);
+    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
+
+    sycl::event copy_for_reshape_event =
+        fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data,
+           dst_data, all_deps);
+
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {copy_for_reshape_event}, shape_strides_owner);
+
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          copy_for_reshape_event);
+}
+
+void init_copy_for_reshape_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::copy_and_cast::CopyForReshapeGenericFactory;
+
+    DispatchVectorBuilder<copy_for_reshape_fn_ptr_t,
+                          CopyForReshapeGenericFactory, num_types>
+        dvb;
+    dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp
new file mode 100644
index 000000000000..c5af885ad6cd
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp
@@ -0,0 +1,54 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src,
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends = {});
+
+extern void init_copy_for_reshape_dispatch_vectors();
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp
new file mode 100644
index 000000000000..a187b2247677
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp
@@ -0,0 +1,400 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "copy_for_roll.hpp"
+#include "kernels/copy_and_cast.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "simplify_iteration_space.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t;
+using dpctl::tensor::kernels::copy_and_cast::
+    copy_for_roll_ndshift_strided_fn_ptr_t;
+using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t;
+using dpctl::utils::keep_args_alive;
+
+// define static vector
+static copy_for_roll_strided_fn_ptr_t
+    copy_for_roll_strided_dispatch_vector[td_ns::num_types];
+
+static copy_for_roll_contig_fn_ptr_t
+    copy_for_roll_contig_dispatch_vector[td_ns::num_types];
+
+static copy_for_roll_ndshift_strided_fn_ptr_t
+    copy_for_roll_ndshift_dispatch_vector[td_ns::num_types];
+
+/*
+ * Copies src into dst (same data type) of different shapes by using flat
+ * iterations.
+ *
+ * Equivalent to the following loop:
+ *
+ * for i for range(src.size):
+ *     dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)]
+ */
+std::pair<sycl::event, sycl::event>
+    copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src,
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 py::ssize_t shift,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    // Must have the same number of dimensions
+    if (src_nd != dst_nd) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_1d requires src and dst to "
+            "have the same number of dimensions.");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_1d requires src and dst to "
+            "have the same shape.");
+    }
+
+    py::ssize_t src_nelems = src.get_size();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    // typenames must be the same
+    if (src_typenum != dst_typenum) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_1d requires src and dst to "
+            "have the same type.");
+    }
+
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    // check same contexts
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if (src_nelems == 1) {
+        // handle special case of 1-element array
+        int src_elemsize = src.get_elemsize();
+        const char *src_data = src.get_data();
+        char *dst_data = dst.get_data();
+        sycl::event copy_ev =
+            exec_q.copy<char>(src_data, dst_data, src_elemsize, depends);
+        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
+                              copy_ev);
+    }
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int type_id = array_types.typenum_to_lookup_id(src_typenum);
+
+    const bool is_src_c_contig = src.is_c_contiguous();
+    const bool is_src_f_contig = src.is_f_contiguous();
+
+    const bool is_dst_c_contig = dst.is_c_contiguous();
+    const bool is_dst_f_contig = dst.is_f_contiguous();
+
+    const bool both_c_contig = is_src_c_contig && is_dst_c_contig;
+    const bool both_f_contig = is_src_f_contig && is_dst_f_contig;
+
+    // normalize shift parameter to be 0 <= offset < src_nelems
+    std::size_t offset =
+        (shift > 0) ? (shift % src_nelems) : src_nelems + (shift % src_nelems);
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    if (both_c_contig || both_f_contig) {
+        auto fn = copy_for_roll_contig_dispatch_vector[type_id];
+
+        if (fn != nullptr) {
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event copy_for_roll_ev =
+                fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data,
+                   zero_offset, depends);
+
+            sycl::event ht_ev =
+                keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev});
+
+            return std::make_pair(ht_ev, copy_for_roll_ev);
+        }
+    }
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_nd;
+    const py::ssize_t *shape = src_shape_ptr;
+
+    // nd, simplified_* and *_offset are modified by reference
+    dpctl::tensor::py_internal::simplify_iteration_space(
+        nd, shape, src_strides, dst_strides,
+        // output
+        simplified_shape, simplified_src_strides, simplified_dst_strides,
+        src_offset, dst_offset);
+
+    if (nd == 1 && simplified_src_strides[0] == 1 &&
+        simplified_dst_strides[0] == 1) {
+        auto fn = copy_for_roll_contig_dispatch_vector[type_id];
+
+        if (fn != nullptr) {
+
+            sycl::event copy_for_roll_ev =
+                fn(exec_q, offset, src_nelems, src_data, src_offset, dst_data,
+                   dst_offset, depends);
+
+            sycl::event ht_ev =
+                keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev});
+
+            return std::make_pair(ht_ev, copy_for_roll_ev);
+        }
+    }
+
+    auto fn = copy_for_roll_strided_dispatch_vector[type_id];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    // shape_strides = [src_shape, src_strides, dst_strides]
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_shape, simplified_src_strides,
+        simplified_dst_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps(depends.size() + 1);
+    all_deps.push_back(copy_shape_ev);
+    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
+
+    sycl::event copy_for_roll_event =
+        fn(exec_q, offset, src_nelems, src_nd, shape_strides, src_data,
+           src_offset, dst_data, dst_offset, all_deps);
+
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {copy_for_roll_event}, shape_strides_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          copy_for_roll_event);
+}
+
+std::pair<sycl::event, sycl::event>
+    copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src,
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 const std::vector<py::ssize_t> &shifts,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    // Must have the same number of dimensions
+    if (src_nd != dst_nd) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires src and dst to "
+            "have the same number of dimensions.");
+    }
+
+    if (static_cast<std::size_t>(src_nd) != shifts.size()) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires shifts to "
+            "contain an integral shift for each array dimension.");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires src and dst to "
+            "have the same shape.");
+    }
+
+    py::ssize_t src_nelems = src.get_size();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    // typenames must be the same
+    if (src_typenum != dst_typenum) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires src and dst to "
+            "have the same type.");
+    }
+
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    // check for compatible queues
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    if (src_nelems == 1) {
+        // handle special case of 1-element array
+        int src_elemsize = src.get_elemsize();
+        const char *src_data = src.get_data();
+        char *dst_data = dst.get_data();
+        sycl::event copy_ev =
+            exec_q.copy<char>(src_data, dst_data, src_elemsize, depends);
+        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
+                              copy_ev);
+    }
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int type_id = array_types.typenum_to_lookup_id(src_typenum);
+
+    std::vector<py::ssize_t> normalized_shifts{};
+    normalized_shifts.reserve(src_nd);
+
+    for (int i = 0; i < src_nd; ++i) {
+        // normalize shift parameter to be 0 <= offset < dim
+        py::ssize_t dim = src_shape_ptr[i];
+        std::size_t offset =
+            (shifts[i] >= 0) ? (shifts[i] % dim) : dim + (shifts[i] % dim);
+
+        normalized_shifts.push_back(offset);
+    }
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+    auto const &common_shape = src.get_shape_vector();
+
+    static constexpr py::ssize_t src_offset = 0;
+    static constexpr py::ssize_t dst_offset = 0;
+
+    auto fn = copy_for_roll_ndshift_dispatch_vector[type_id];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    // shape_strides = [src_shape, src_strides, dst_strides]
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, common_shape, src_strides, dst_strides,
+        normalized_shifts);
+    auto shape_strides_shifts_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides_shifts = shape_strides_shifts_owner.get();
+
+    std::vector<sycl::event> all_deps(depends.size() + 1);
+    all_deps.push_back(copy_shape_ev);
+    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
+
+    sycl::event copy_for_roll_event =
+        fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data,
+           src_offset, dst_data, dst_offset, all_deps);
+
+    auto temporaries_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {copy_for_roll_event}, shape_strides_shifts_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          copy_for_roll_event);
+}
+
+void init_copy_for_roll_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::copy_and_cast::CopyForRollStridedFactory;
+
+    DispatchVectorBuilder<copy_for_roll_strided_fn_ptr_t,
+                          CopyForRollStridedFactory, num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(copy_for_roll_strided_dispatch_vector);
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyForRollContigFactory;
+    DispatchVectorBuilder<copy_for_roll_contig_fn_ptr_t,
+                          CopyForRollContigFactory, num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector);
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyForRollNDShiftFactory;
+    DispatchVectorBuilder<copy_for_roll_ndshift_strided_fn_ptr_t,
+                          CopyForRollNDShiftFactory, num_types>
+        dvb3;
+    dvb3.populate_dispatch_vector(copy_for_roll_ndshift_dispatch_vector);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp
new file mode 100644
index 000000000000..cffbf9f6f0d6
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp
@@ -0,0 +1,65 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src,
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 py::ssize_t shift,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends = {});
+
+extern std::pair<sycl::event, sycl::event>
+    copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src,
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 const std::vector<py::ssize_t> &shifts,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends = {});
+
+extern void init_copy_for_roll_dispatch_vectors();
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
new file mode 100644
index 000000000000..e97e8aeb1ca1
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
@@ -0,0 +1,368 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <iterator>
+#include <stdexcept>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "kernels/copy_and_cast.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "copy_numpy_ndarray_into_usm_ndarray.hpp"
+#include "simplify_iteration_space.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+using dpctl::tensor::kernels::copy_and_cast::
+    copy_and_cast_from_host_blocking_fn_ptr_t;
+
+static copy_and_cast_from_host_blocking_fn_ptr_t
+    copy_and_cast_from_host_blocking_dispatch_table[td_ns::num_types]
+                                                   [td_ns::num_types];
+
+using dpctl::tensor::kernels::copy_and_cast::
+    copy_and_cast_from_host_contig_blocking_fn_ptr_t;
+
+static copy_and_cast_from_host_contig_blocking_fn_ptr_t
+    copy_and_cast_from_host_contig_blocking_dispatch_table[td_ns::num_types]
+                                                          [td_ns::num_types];
+
+void copy_numpy_ndarray_into_usm_ndarray(
+    const py::array &npy_src,
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends)
+{
+    int src_ndim = npy_src.ndim();
+    int dst_ndim = dst.get_ndim();
+
+    if (src_ndim != dst_ndim) {
+        throw py::value_error("Source ndarray and destination usm_ndarray have "
+                              "different array ranks, "
+                              "i.e. different number of indices needed to "
+                              "address array elements.");
+    }
+
+    const py::ssize_t *src_shape = npy_src.shape();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+    for (int i = 0; shapes_equal && (i < src_ndim); ++i) {
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+    }
+
+    if (!shapes_equal) {
+        throw py::value_error("Source ndarray and destination usm_ndarray have "
+                              "difference shapes.");
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return;
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error("Execution queue is not compatible with the "
+                              "allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // here we assume that NumPy's type numbers agree with ours for types
+    // supported in both
+    int src_typenum =
+        py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num;
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    py::buffer_info src_pybuf = npy_src.request();
+    const char *const src_data = static_cast<const char *const>(src_pybuf.ptr);
+    char *dst_data = dst.get_data();
+
+    int src_flags = npy_src.flags();
+
+    // check for applicability of special cases:
+    //      (same type && (both C-contiguous || both F-contiguous)
+    const bool both_c_contig =
+        ((src_flags & py::array::c_style) && dst.is_c_contiguous());
+    const bool both_f_contig =
+        ((src_flags & py::array::f_style) && dst.is_f_contiguous());
+
+    const bool same_data_types = (src_type_id == dst_type_id);
+
+    if (both_c_contig || both_f_contig) {
+        if (same_data_types) {
+            int src_elem_size = npy_src.itemsize();
+
+            sycl::event copy_ev =
+                exec_q.memcpy(static_cast<void *>(dst_data),
+                              static_cast<const void *>(src_data),
+                              src_nelems * src_elem_size, depends);
+
+            {
+                // wait for copy_ev to complete
+                // release GIL to allow other threads (host_tasks)
+                // a chance to acquire GIL
+                py::gil_scoped_release lock{};
+                copy_ev.wait();
+            }
+
+            return;
+        }
+        else {
+            py::gil_scoped_release lock{};
+
+            auto copy_and_cast_from_host_contig_blocking_fn =
+                copy_and_cast_from_host_contig_blocking_dispatch_table
+                    [dst_type_id][src_type_id];
+
+            static constexpr py::ssize_t zero_offset(0);
+
+            copy_and_cast_from_host_contig_blocking_fn(
+                exec_q, src_nelems, src_data, zero_offset, dst_data,
+                zero_offset, depends);
+
+            return;
+        }
+    }
+
+    auto const &dst_strides =
+        dst.get_strides_vector(); // N.B.: strides in elements
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_ndim;
+    const py::ssize_t *shape = src_shape;
+
+    const py::ssize_t *src_strides_p =
+        npy_src.strides();                         // N.B.: strides in bytes
+    py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes
+
+    bool is_src_c_contig = ((src_flags & py::array::c_style) != 0);
+    bool is_src_f_contig = ((src_flags & py::array::f_style) != 0);
+
+    shT src_strides_in_elems;
+    if (src_strides_p) {
+        src_strides_in_elems.resize(nd);
+        // copy and convert strides from bytes to elements
+        std::transform(
+            src_strides_p, src_strides_p + nd, std::begin(src_strides_in_elems),
+            [src_itemsize](py::ssize_t el) {
+                py::ssize_t q = el / src_itemsize;
+                if (q * src_itemsize != el) {
+                    throw std::runtime_error(
+                        "NumPy array strides are not multiple of itemsize");
+                }
+                return q;
+            });
+    }
+    else {
+        if (is_src_c_contig) {
+            src_strides_in_elems =
+                dpctl::tensor::c_contiguous_strides(nd, src_shape);
+        }
+        else if (is_src_f_contig) {
+            src_strides_in_elems =
+                dpctl::tensor::f_contiguous_strides(nd, src_shape);
+        }
+        else {
+            throw py::value_error("NumPy source array has null strides but is "
+                                  "neither C- nor F-contiguous.");
+        }
+    }
+
+    // nd, simplified_* vectors and offsets are modified by reference
+    simplify_iteration_space(nd, shape, src_strides_in_elems, dst_strides,
+                             // outputs
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
+
+    assert(simplified_shape.size() == static_cast<std::size_t>(nd));
+    assert(simplified_src_strides.size() == static_cast<std::size_t>(nd));
+    assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
+
+    // handle nd == 0
+    if (nd == 0) {
+        nd = 1;
+        simplified_shape.reserve(nd);
+        simplified_shape.push_back(1);
+
+        simplified_src_strides.reserve(nd);
+        simplified_src_strides.push_back(1);
+
+        simplified_dst_strides.reserve(nd);
+        simplified_dst_strides.push_back(1);
+    }
+
+    const bool is_contig_vector =
+        ((nd == 1) && (simplified_src_strides.front() == 1) &&
+         (simplified_dst_strides.front() == 1));
+
+    const bool can_use_memcpy = (same_data_types && is_contig_vector &&
+                                 (src_offset == 0) && (dst_offset == 0));
+
+    if (can_use_memcpy) {
+        int src_elem_size = npy_src.itemsize();
+
+        sycl::event copy_ev = exec_q.memcpy(
+            static_cast<void *>(dst_data), static_cast<const void *>(src_data),
+            src_nelems * src_elem_size, depends);
+
+        {
+            // wait for copy_ev to complete
+            // release GIL to allow other threads (host_tasks)
+            // a chance to acquire GIL
+            py::gil_scoped_release lock{};
+
+            copy_ev.wait();
+        }
+
+        return;
+    }
+
+    // Minimum and maximum element offsets for source np.ndarray
+    py::ssize_t npy_src_min_nelem_offset(src_offset);
+    py::ssize_t npy_src_max_nelem_offset(src_offset);
+    for (int i = 0; i < nd; ++i) {
+        if (simplified_src_strides[i] < 0) {
+            npy_src_min_nelem_offset +=
+                simplified_src_strides[i] * (simplified_shape[i] - 1);
+        }
+        else {
+            npy_src_max_nelem_offset +=
+                simplified_src_strides[i] * (simplified_shape[i] - 1);
+        }
+    }
+
+    if (is_contig_vector) {
+        // release GIL for the blocking call
+        py::gil_scoped_release lock{};
+
+        auto copy_and_cast_from_host_contig_blocking_fn =
+            copy_and_cast_from_host_contig_blocking_dispatch_table[dst_type_id]
+                                                                  [src_type_id];
+
+        copy_and_cast_from_host_contig_blocking_fn(exec_q, src_nelems, src_data,
+                                                   src_offset, dst_data,
+                                                   dst_offset, depends);
+
+        return;
+    }
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(1);
+
+    // Copy shape strides into device memory
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_shape, simplified_src_strides,
+        simplified_dst_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    {
+        // release GIL for the blocking call
+        py::gil_scoped_release lock{};
+
+        // Get implementation function pointer
+        auto copy_and_cast_from_host_blocking_fn =
+            copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
+                                                           [src_type_id];
+
+        copy_and_cast_from_host_blocking_fn(
+            exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
+            npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
+            dst_offset, depends, {copy_shape_ev});
+
+        // invoke USM deleter in smart pointer while GIL is held
+        shape_strides_owner.reset(nullptr);
+    }
+
+    return;
+}
+
+void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory;
+
+    DispatchTableBuilder<copy_and_cast_from_host_blocking_fn_ptr_t,
+                         CopyAndCastFromHostFactory, num_types>
+        dtb_copy_from_numpy;
+
+    dtb_copy_from_numpy.populate_dispatch_table(
+        copy_and_cast_from_host_blocking_dispatch_table);
+
+    using dpctl::tensor::kernels::copy_and_cast::
+        CopyAndCastFromHostContigFactory;
+
+    DispatchTableBuilder<copy_and_cast_from_host_contig_blocking_fn_ptr_t,
+                         CopyAndCastFromHostContigFactory, num_types>
+        dtb_copy_from_numpy_contig;
+
+    dtb_copy_from_numpy_contig.populate_dispatch_table(
+        copy_and_cast_from_host_contig_blocking_dispatch_table);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
new file mode 100644
index 000000000000..f2de95f97cca
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
@@ -0,0 +1,57 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void copy_numpy_ndarray_into_usm_ndarray(
+    const py::array &npy_src,
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends = {});
+
+extern void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
index 0478fb19678c..3e5be4d9e8fe 100644
--- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
+++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
@@ -48,9 +48,9 @@
 // #include "clip.hpp"
 #include "copy_and_cast_usm_to_usm.hpp"
 #include "copy_as_contig.hpp"
-// #include "copy_for_reshape.hpp"
-// #include "copy_for_roll.hpp"
-// #include "copy_numpy_ndarray_into_usm_ndarray.hpp"
+#include "copy_for_reshape.hpp"
+#include "copy_for_roll.hpp"
+#include "copy_numpy_ndarray_into_usm_ndarray.hpp"
 #include "device_support_queries.hpp"
 // #include "eye_ctor.hpp"
 #include "full_ctor.hpp"
@@ -84,16 +84,16 @@ using dpctl::tensor::py_internal::py_as_f_contig;
 
 /* =========================== Copy for reshape ============================= */
 
-// using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape;
+using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape;
 
 /* =========================== Copy for roll ============================= */
 
-// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d;
-// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd;
+using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d;
+using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd;
 
 /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */
 
-// using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray;
+using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray;
 
 /* ============= linear-sequence ==================== */
 
@@ -143,7 +143,7 @@ void init_dispatch_tables(void)
     using namespace dpctl::tensor::py_internal;
 
     init_copy_and_cast_usm_to_usm_dispatch_tables();
-    // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables();
+    init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables();
     init_advanced_indexing_dispatch_tables();
     // init_where_dispatch_tables();
     return;
@@ -155,8 +155,8 @@ void init_dispatch_vectors(void)
     using namespace dpctl::tensor::py_internal;
 
     init_copy_as_contig_dispatch_vectors();
-    // init_copy_for_reshape_dispatch_vectors();
-    // init_copy_for_roll_dispatch_vectors();
+    init_copy_for_reshape_dispatch_vectors();
+    init_copy_for_roll_dispatch_vectors();
     // init_linear_sequences_dispatch_vectors();
     init_full_ctor_dispatch_vectors();
     init_zeros_ctor_dispatch_vectors();
@@ -276,28 +276,29 @@ PYBIND11_MODULE(_tensor_impl, m)
         },
         "");
 
-    // m.def("_copy_usm_ndarray_for_reshape", &copy_usm_ndarray_for_reshape,
-    //       "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same
-    //       " "number of elements using underlying 'C'-contiguous order for
-    //       flat " "traversal. " "Returns a tuple of events: (ht_event,
-    //       comp_event)", py::arg("src"), py::arg("dst"),
-    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+    m.def("_copy_usm_ndarray_for_reshape", &copy_usm_ndarray_for_reshape,
+          "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same "
+          "number of elements using underlying 'C'-contiguous order for flat "
+          "traversal. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 
-    // m.def("_copy_usm_ndarray_for_roll_1d", &copy_usm_ndarray_for_roll_1d,
-    //       "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same
-    //       " "shapes using underlying 'C'-contiguous order for flat "
-    //       "traversal with shift. "
-    //       "Returns a tuple of events: (ht_event, comp_event)",
-    //       py::arg("src"), py::arg("dst"), py::arg("shift"),
-    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+    m.def("_copy_usm_ndarray_for_roll_1d", &copy_usm_ndarray_for_roll_1d,
+          "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same "
+          "shapes using underlying 'C'-contiguous order for flat "
+          "traversal with shift. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("src"), py::arg("dst"), py::arg("shift"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
 
-    // m.def("_copy_usm_ndarray_for_roll_nd", &copy_usm_ndarray_for_roll_nd,
-    //       "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same
-    //       " "shapes using underlying 'C'-contiguous order for " "traversal
-    //       with shifts along each axis. " "Returns a tuple of events:
-    //       (ht_event, comp_event)", py::arg("src"), py::arg("dst"),
-    //       py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") =
-    //       py::list());
+    m.def("_copy_usm_ndarray_for_roll_nd", &copy_usm_ndarray_for_roll_nd,
+          "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same "
+          "shapes using underlying 'C'-contiguous order for "
+          "traversal with shifts along each axis. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("src"), py::arg("dst"), py::arg("shifts"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
 
     //     m.def("_linspace_step", &usm_ndarray_linear_sequence_step,
     //           "Fills input 1D contiguous usm_ndarray `dst` with linear
@@ -314,11 +315,11 @@ PYBIND11_MODULE(_tensor_impl, m)
     //           py::arg("include_endpoint"), py::arg("sycl_queue"),
     //           py::arg("depends") = py::list());
 
-    // m.def("_copy_numpy_ndarray_into_usm_ndarray",
-    //       &copy_numpy_ndarray_into_usm_ndarray,
-    //       "Copy from numpy array `src` into usm_ndarray `dst`
-    //       synchronously.", py::arg("src"), py::arg("dst"),
-    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+    m.def("_copy_numpy_ndarray_into_usm_ndarray",
+          &copy_numpy_ndarray_into_usm_ndarray,
+          "Copy from numpy array `src` into usm_ndarray `dst` synchronously.",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 
     m.def("_zeros_usm_ndarray", &usm_ndarray_zeros,
           "Populate usm_ndarray `dst` with zeros.", py::arg("dst"),
diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py
index d94a031801f3..47edf63a68b4 100644
--- a/dpnp/dpnp_algo/dpnp_arraycreation.py
+++ b/dpnp/dpnp_algo/dpnp_arraycreation.py
@@ -33,6 +33,9 @@
 import dpctl.utils as dpu
 import numpy
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device
@@ -230,7 +233,9 @@ def dpnp_linspace(
             usm_type=_usm_type,
             sycl_queue=sycl_queue_normalized,
         )
-        usm_res = dpt.reshape(usm_res, (-1,) + (1,) * delta.ndim, copy=False)
+        usm_res = dpt_ext.reshape(
+            usm_res, (-1,) + (1,) * delta.ndim, copy=False
+        )
 
         if step_num > 0:
             step = delta / step_num
@@ -256,7 +261,7 @@ def dpnp_linspace(
     if dpnp.issubdtype(dtype, dpnp.integer):
         dpt.floor(usm_res, out=usm_res)
 
-    res = dpt.astype(usm_res, dtype, copy=False)
+    res = dpt_ext.astype(usm_res, dtype, copy=False)
     res = dpnp_array._create_from_usm_ndarray(res)
 
     if retstep is True:
diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
index 88abcee5035c..55d74e8c1803 100644
--- a/dpnp/dpnp_algo/dpnp_elementwise_common.py
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -47,6 +47,7 @@
 # pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._tensor_impl as dti
 import dpnp
 import dpnp.backend.extensions.vm._vm_impl as vmi
@@ -212,7 +213,7 @@ def __call__(
 
         x_usm = dpnp.get_usm_ndarray(x)
         if dtype is not None:
-            x_usm = dpt.astype(x_usm, dtype, copy=False)
+            x_usm = dpt_ext.astype(x_usm, dtype, copy=False)
 
         out = self._unpack_out_kw(out)
         out_usm = None if out is None else dpnp.get_usm_ndarray(out)
@@ -718,9 +719,9 @@ def __call__(
                     sycl_queue=x2.sycl_queue,
                     usm_type=x2.usm_type,
                 )
-                x2_usm = dpt.astype(x2_usm, dtype, copy=False)
+                x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False)
             elif dpnp.isscalar(x2):
-                x1_usm = dpt.astype(x1_usm, dtype, copy=False)
+                x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False)
                 x2_usm = dpt.asarray(
                     x2,
                     dtype=dtype,
@@ -728,8 +729,8 @@ def __call__(
                     usm_type=x1.usm_type,
                 )
             else:
-                x1_usm = dpt.astype(x1_usm, dtype, copy=False)
-                x2_usm = dpt.astype(x2_usm, dtype, copy=False)
+                x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False)
+                x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False)
 
         res_usm = super().__call__(x1_usm, x2_usm, out=out_usm, order=order)
 
@@ -1325,7 +1326,7 @@ def __call__(self, x, /, decimals=0, out=None, *, dtype=None):
                 res_usm = dpt.divide(x_usm, 10**decimals, out=out_usm)
 
             if dtype is not None:
-                res_usm = dpt.astype(res_usm, dtype, copy=False)
+                res_usm = dpt_ext.astype(res_usm, dtype, copy=False)
 
             if out is not None and isinstance(out, dpnp_array):
                 return out
diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py
index 4137a2794747..ddba9f634cb1 100644
--- a/dpnp/dpnp_algo/dpnp_fill.py
+++ b/dpnp/dpnp_algo/dpnp_fill.py
@@ -32,10 +32,10 @@
 import dpctl.utils as dpu
 from dpctl.tensor._ctors import _cast_fill_val
 
-import dpnp
-
 # TODO: revert to `from dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+import dpnp
 from dpctl_ext.tensor._tensor_impl import (
     _copy_usm_ndarray_into_usm_ndarray,
     _full_usm_ndarray,
@@ -56,7 +56,7 @@ def dpnp_fill(arr, val):
             raise dpu.ExecutionPlacementError(
                 "Input arrays have incompatible queues."
             )
-        a_val = dpt.astype(val, arr.dtype)
+        a_val = dpt_ext.astype(val, arr.dtype)
         a_val = dpt.broadcast_to(a_val, arr.shape)
         _manager = dpu.SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py
index bb864d4444a9..0b6d882c53db 100644
--- a/dpnp/dpnp_array.py
+++ b/dpnp/dpnp_array.py
@@ -41,6 +41,9 @@
 import dpctl.tensor._type_utils as dtu
 from dpctl.tensor._numpy_helper import AxisError
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 
 from . import memory as dpm
@@ -774,7 +777,7 @@ def asnumpy(self):
 
         """
 
-        return dpt.asnumpy(self._array_obj)
+        return dpt_ext.asnumpy(self._array_obj)
 
     def astype(
         self,
diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py
index c8e28529cd57..acda579a5f5e 100644
--- a/dpnp/dpnp_container.py
+++ b/dpnp/dpnp_container.py
@@ -38,6 +38,8 @@
 import dpctl.tensor as dpt
 import dpctl.utils as dpu
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt_ext
 import dpnp
 from dpnp.dpnp_array import dpnp_array
@@ -141,7 +143,7 @@ def copy(x1, /, *, order="K"):
     if order is None:
         order = "K"
 
-    array_obj = dpt.copy(dpnp.get_usm_ndarray(x1), order=order)
+    array_obj = dpt_ext.copy(dpnp.get_usm_ndarray(x1), order=order)
     return dpnp_array._create_from_usm_ndarray(array_obj)
 
 
diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index 533bdc36c617..6c050a208981 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -53,6 +53,7 @@
 # pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 
@@ -136,7 +137,7 @@ def asnumpy(a, order="C"):
         return a.asnumpy()
 
     if isinstance(a, dpt.usm_ndarray):
-        return dpt.asnumpy(a)
+        return dpt_ext.asnumpy(a)
 
     return numpy.asarray(a, order=order)
 
diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py
index e7b902647186..52fc4b7f6448 100644
--- a/dpnp/dpnp_iface_arraycreation.py
+++ b/dpnp/dpnp_iface_arraycreation.py
@@ -46,6 +46,9 @@
 import dpctl.tensor as dpt
 import numpy
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 from dpnp import dpnp_container
 
@@ -934,7 +937,7 @@ def astype(x, dtype, /, *, order="K", casting="unsafe", copy=True, device=None):
         order = "K"
 
     usm_x = dpnp.get_usm_ndarray(x)
-    usm_res = dpt.astype(
+    usm_res = dpt_ext.astype(
         usm_x, dtype, order=order, casting=casting, copy=copy, device=device
     )
 
@@ -3116,7 +3119,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"):
 
     s0 = (1,) * ndim
     output = [
-        dpt.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :])
+        dpt_ext.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :])
         for i, x in enumerate(xi)
     ]
 
@@ -3124,14 +3127,14 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"):
     _, _ = get_usm_allocations(output)
 
     if indexing == "xy" and ndim > 1:
-        output[0] = dpt.reshape(output[0], (1, -1) + s0[2:])
-        output[1] = dpt.reshape(output[1], (-1, 1) + s0[2:])
+        output[0] = dpt_ext.reshape(output[0], (1, -1) + s0[2:])
+        output[1] = dpt_ext.reshape(output[1], (-1, 1) + s0[2:])
 
     if not sparse:
         output = dpt.broadcast_arrays(*output)
 
     if copy:
-        output = [dpt.copy(x) for x in output]
+        output = [dpt_ext.copy(x) for x in output]
 
     return [dpnp_array._create_from_usm_ndarray(x) for x in output]
 
@@ -3931,7 +3934,7 @@ def vander(
 
     tmp = m[:, ::-1] if not increasing else m
     dpnp.power(
-        dpt.reshape(usm_x, (-1, 1)),
+        dpt_ext.reshape(usm_x, (-1, 1)),
         dpt.arange(
             N, dtype=_dtype, usm_type=x_usm_type, sycl_queue=x_sycl_queue
         ),
diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index 583561573b85..b0769337c38b 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -51,11 +51,10 @@
 from dpctl.tensor._indexing_functions import _get_indexing_mode
 from dpctl.tensor._numpy_helper import normalize_axis_index
 
-import dpctl_ext.tensor as dpt_ext
-
 # pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 
@@ -243,7 +242,7 @@ def choose(a, choices, out=None, mode="wrap"):
         # NumPy will cast up to int64 in general but
         # int32 is more than safe for bool
         if ind_dt == dpnp.bool:
-            inds = dpt.astype(inds, dpt.int32)
+            inds = dpt_ext.astype(inds, dpt.int32)
         else:
             raise TypeError("input index array must be of integer data type")
 
@@ -256,7 +255,7 @@ def choose(a, choices, out=None, mode="wrap"):
         choices = tuple(
             map(
                 lambda chc: (
-                    chc if chc.dtype == res_dt else dpt.astype(chc, res_dt)
+                    chc if chc.dtype == res_dt else dpt_ext.astype(chc, res_dt)
                 ),
                 choices,
             )
@@ -815,14 +814,14 @@ def extract(condition, a):
     )
 
     if usm_cond.size != usm_a.size:
-        usm_a = dpt.reshape(usm_a, -1)
-        usm_cond = dpt.reshape(usm_cond, -1)
+        usm_a = dpt_ext.reshape(usm_a, -1)
+        usm_cond = dpt_ext.reshape(usm_cond, -1)
 
         usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0])
     else:
         if usm_cond.shape != usm_a.shape:
-            usm_a = dpt.reshape(usm_a, -1)
-            usm_cond = dpt.reshape(usm_cond, -1)
+            usm_a = dpt_ext.reshape(usm_a, -1)
+            usm_cond = dpt_ext.reshape(usm_cond, -1)
 
         usm_res = dpt.extract(usm_cond, usm_a)
 
@@ -959,18 +958,18 @@ def fill_diagonal(a, val, wrap=False):
     # a.flat[:end:step] = val
     # but need to consider use case when `a` is usm_ndarray also
     a_sh = a.shape
-    tmp_a = dpt.reshape(usm_a, -1)
+    tmp_a = dpt_ext.reshape(usm_a, -1)
     if dpnp.isscalar(usm_val):
         tmp_a[:end:step] = usm_val
     else:
-        usm_val = dpt.reshape(usm_val, -1)
+        usm_val = dpt_ext.reshape(usm_val, -1)
 
         # Setitem can work only if index size equal val size.
         # Using loop for general case without dependencies of val size.
         for i in range(0, usm_val.size):
             tmp_a[step * i : end : step * (i + 1)] = usm_val[i]
 
-    tmp_a = dpt.reshape(tmp_a, a_sh)
+    tmp_a = dpt_ext.reshape(tmp_a, a_sh)
     usm_a[:] = tmp_a
 
 
@@ -1611,12 +1610,14 @@ def place(a, mask, vals):
 
     if usm_vals.ndim != 1:
         # dpt.place supports only 1-D array of values
-        usm_vals = dpt.reshape(usm_vals, -1)
+        usm_vals = dpt_ext.reshape(usm_vals, -1)
 
     if usm_vals.dtype != usm_a.dtype:
         # dpt.place casts values to a.dtype with "unsafe" rule,
         # while numpy.place does that with "safe" casting rule
-        usm_vals = dpt.astype(usm_vals, usm_a.dtype, casting="safe", copy=False)
+        usm_vals = dpt_ext.astype(
+            usm_vals, usm_a.dtype, casting="safe", copy=False
+        )
 
     dpt.place(usm_a, usm_mask, usm_vals)
 
@@ -1708,19 +1709,19 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"):
 
     if usm_ind.ndim != 1:
         # dpt.put supports only 1-D array of indices
-        usm_ind = dpt.reshape(usm_ind, -1, copy=False)
+        usm_ind = dpt_ext.reshape(usm_ind, -1, copy=False)
 
     if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer):
         # dpt.put supports only integer dtype for array of indices
-        usm_ind = dpt.astype(usm_ind, dpnp.intp, casting="safe")
+        usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, casting="safe")
 
     in_usm_a = usm_a
     if axis is None and usm_a.ndim > 1:
-        usm_a = dpt.reshape(usm_a, -1)
+        usm_a = dpt_ext.reshape(usm_a, -1)
 
     dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode)
     if in_usm_a._pointer != usm_a._pointer:  # pylint: disable=protected-access
-        in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False)
+        in_usm_a[:] = dpt_ext.reshape(usm_a, in_usm_a.shape, copy=False)
 
 
 def put_along_axis(a, ind, values, axis, mode="wrap"):
@@ -2162,7 +2163,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"):
     if axis is None:
         if a_ndim > 1:
             # flatten input array
-            usm_a = dpt.reshape(usm_a, -1)
+            usm_a = dpt_ext.reshape(usm_a, -1)
         axis = 0
     elif a_ndim == 0:
         axis = normalize_axis_index(operator.index(axis), 1)
@@ -2171,7 +2172,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"):
 
     if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer):
         # dpt.take supports only integer dtype for array of indices
-        usm_ind = dpt.astype(usm_ind, dpnp.intp, copy=False, casting="safe")
+        usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, copy=False, casting="safe")
 
     usm_res = _take_index(
         usm_a, usm_ind, axis, exec_q, res_usm_type, out=out, mode=mode
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index dd872485a602..4866c912ab6a 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -53,6 +53,9 @@
     normalize_axis_tuple,
 )
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 
 from .dpnp_array import dpnp_array
@@ -415,7 +418,7 @@ def _get_first_nan_index(usm_a):
             dpt.place(
                 usm_res.inverse_indices,
                 usm_res.inverse_indices > first_nan,
-                dpt.reshape(first_nan, 1),
+                dpt_ext.reshape(first_nan, 1),
             )
 
         result += (usm_res.inverse_indices,)
@@ -3057,7 +3060,7 @@ def reshape(a, /, shape, order="C", *, copy=None):
         )
 
     usm_a = dpnp.get_usm_ndarray(a)
-    usm_res = dpt.reshape(usm_a, shape=shape, order=order, copy=copy)
+    usm_res = dpt_ext.reshape(usm_a, shape=shape, order=order, copy=copy)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -3259,9 +3262,9 @@ def roll(x, shift, axis=None):
         shift = dpnp.asnumpy(shift)
 
     if axis is None:
-        return roll(dpt.reshape(usm_x, -1), shift, 0).reshape(x.shape)
+        return roll(dpt_ext.reshape(usm_x, -1), shift, 0).reshape(x.shape)
 
-    usm_res = dpt.roll(usm_x, shift=shift, axis=axis)
+    usm_res = dpt_ext.roll(usm_x, shift=shift, axis=axis)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py
index 9c5097a5f3e3..be6c52ae9d80 100644
--- a/dpnp/dpnp_iface_sorting.py
+++ b/dpnp/dpnp_iface_sorting.py
@@ -44,6 +44,9 @@
 import dpctl.tensor as dpt
 from dpctl.tensor._numpy_helper import normalize_axis_index
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 
 # pylint: disable=no-name-in-module
@@ -84,7 +87,7 @@ def _wrap_sort_argsort(
 
     usm_a = dpnp.get_usm_ndarray(a)
     if axis is None:
-        usm_a = dpt.reshape(usm_a, -1)
+        usm_a = dpt_ext.reshape(usm_a, -1)
         axis = -1
 
     axis = normalize_axis_index(axis, ndim=usm_a.ndim)
diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py
index 7e092184366c..daff981d5cc4 100644
--- a/dpnp/dpnp_iface_statistics.py
+++ b/dpnp/dpnp_iface_statistics.py
@@ -47,6 +47,9 @@
 import numpy
 from dpctl.tensor._numpy_helper import normalize_axis_index
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 
 # pylint: disable=no-name-in-module
@@ -1204,7 +1207,7 @@ def mean(a, /, axis=None, dtype=None, out=None, keepdims=False, *, where=True):
     usm_a = dpnp.get_usm_ndarray(a)
     usm_res = dpt.mean(usm_a, axis=axis, keepdims=keepdims)
     if dtype is not None:
-        usm_res = dpt.astype(usm_res, dtype)
+        usm_res = dpt_ext.astype(usm_res, dtype)
 
     return dpnp.get_result_array(usm_res, out, casting="unsafe")
 
diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py
index d8a80ddbff78..88e6aacb997d 100644
--- a/dpnp/tests/test_arraycreation.py
+++ b/dpnp/tests/test_arraycreation.py
@@ -13,6 +13,9 @@
     assert_raises_regex,
 )
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 
 from .helper import (
@@ -969,7 +972,7 @@ def test_ones_like(array, dtype, order):
     ],
 )
 def test_dpctl_tensor_input(func, args):
-    x0 = dpt.reshape(dpt.arange(9), (3, 3))
+    x0 = dpt_ext.reshape(dpt.arange(9), (3, 3))
     new_args = [eval(val, {"x0": x0}) for val in args]
     X = getattr(dpt, func)(*new_args)
     Y = getattr(dpnp, func)(*new_args)

From 192bd937230e671bac1ebcf4e3addccccc83c890 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 3 Mar 2026 14:39:46 +0100
Subject: [PATCH 05/43] Extend `._tensor_impl` with advanced indexing functions
 (#2777)

This PR extends `_tensor_impl` in `dpctl_ext.tensor` with the advanced
indexing (`_extract, _place, _nonzero, mask_positions, `), repeat
(`_cumsum_1d`) and `_eye` functions

It also adds `eye(), extract(), nonzero(), place(), put_along_axis(),
take_along_axis()` to `dpctl_ext.tensor` and updates the corresponding
dpnp functions to use these implementations internally
---
 dpctl_ext/tensor/CMakeLists.txt               |    6 +-
 dpctl_ext/tensor/__init__.py                  |   12 +
 dpctl_ext/tensor/_copy_utils.py               |  306 ++++
 dpctl_ext/tensor/_ctors.py                    |  128 ++
 dpctl_ext/tensor/_indexing_functions.py       |  309 ++++
 .../include/kernels/accumulators.hpp          | 1448 +++++++++++++++++
 .../kernels/boolean_advanced_indexing.hpp     |  853 ++++++++++
 .../include/kernels/constructors.hpp          |   96 +-
 .../tensor/libtensor/source/accumulators.cpp  |  406 +++++
 .../tensor/libtensor/source/accumulators.hpp  |   62 +
 .../source/boolean_advanced_indexing.cpp      |  859 ++++++++++
 .../source/boolean_advanced_indexing.hpp      |   81 +
 .../tensor/libtensor/source/eye_ctor.cpp      |  142 ++
 .../tensor/libtensor/source/eye_ctor.hpp      |   57 +
 .../tensor/libtensor/source/tensor_ctors.cpp  |   74 +-
 dpnp/dpnp_container.py                        |    2 +-
 dpnp/dpnp_iface_indexing.py                   |   18 +-
 dpnp/dpnp_iface_manipulation.py               |    2 +-
 18 files changed, 4809 insertions(+), 52 deletions(-)
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/eye_ctor.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/eye_ctor.hpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index 93555981deaa..0b166a202735 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -45,7 +45,7 @@ set(_static_lib_sources
 )
 set(_tensor_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
@@ -53,8 +53,8 @@ set(_tensor_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp
     # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index edb2c096bad1..fa76faccc632 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -35,13 +35,19 @@
     to_numpy,
 )
 from dpctl_ext.tensor._ctors import (
+    eye,
     full,
     tril,
     triu,
 )
 from dpctl_ext.tensor._indexing_functions import (
+    extract,
+    nonzero,
+    place,
     put,
+    put_along_axis,
     take,
+    take_along_axis,
 )
 from dpctl_ext.tensor._manipulation_functions import (
     roll,
@@ -52,12 +58,18 @@
     "asnumpy",
     "astype",
     "copy",
+    "extract",
+    "eye",
     "from_numpy",
     "full",
+    "nonzero",
+    "place",
     "put",
+    "put_along_axis",
     "reshape",
     "roll",
     "take",
+    "take_along_axis",
     "to_numpy",
     "tril",
     "triu",
diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py
index c62218893a2c..5d1ac209c86b 100644
--- a/dpctl_ext/tensor/_copy_utils.py
+++ b/dpctl_ext/tensor/_copy_utils.py
@@ -27,6 +27,8 @@
 # *****************************************************************************
 
 import builtins
+import operator
+from numbers import Integral
 
 import dpctl
 import dpctl.memory as dpm
@@ -39,8 +41,11 @@
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._tensor_impl as ti
 
+from ._numpy_helper import normalize_axis_index
+
 __doc__ = (
     "Implementation module for copy- and cast- operations on "
     ":class:`dpctl.tensor.usm_ndarray`."
@@ -130,6 +135,307 @@ def _copy_from_numpy_into(dst, np_ary):
     )
 
 
+def _extract_impl(ary, ary_mask, axis=0):
+    """
+    Extract elements of ary by applying mask starting from slot
+    dimension axis
+    """
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+        )
+    if isinstance(ary_mask, dpt.usm_ndarray):
+        dst_usm_type = dpctl.utils.get_coerced_usm_type(
+            (ary.usm_type, ary_mask.usm_type)
+        )
+        exec_q = dpctl.utils.get_execution_queue(
+            (ary.sycl_queue, ary_mask.sycl_queue)
+        )
+        if exec_q is None:
+            raise dpctl.utils.ExecutionPlacementError(
+                "arrays have different associated queues. "
+                "Use `y.to_device(x.device)` to migrate."
+            )
+    elif isinstance(ary_mask, np.ndarray):
+        dst_usm_type = ary.usm_type
+        exec_q = ary.sycl_queue
+        ary_mask = dpt.asarray(
+            ary_mask, usm_type=dst_usm_type, sycl_queue=exec_q
+        )
+    else:
+        raise TypeError(
+            "Expecting type dpctl.tensor.usm_ndarray or numpy.ndarray, got "
+            f"{type(ary_mask)}"
+        )
+    ary_nd = ary.ndim
+    pp = normalize_axis_index(operator.index(axis), ary_nd)
+    mask_nd = ary_mask.ndim
+    if pp < 0 or pp + mask_nd > ary_nd:
+        raise ValueError(
+            "Parameter p is inconsistent with input array dimensions"
+        )
+    mask_nelems = ary_mask.size
+    cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
+    cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device)
+    exec_q = cumsum.sycl_queue
+    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    mask_count = ti.mask_positions(
+        ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs
+    )
+    dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
+    dst = dpt.empty(
+        dst_shape, dtype=ary.dtype, usm_type=dst_usm_type, device=ary.device
+    )
+    if dst.size == 0:
+        return dst
+    hev, ev = ti._extract(
+        src=ary,
+        cumsum=cumsum,
+        axis_start=pp,
+        axis_end=pp + mask_nd,
+        dst=dst,
+        sycl_queue=exec_q,
+        depends=dep_evs,
+    )
+    _manager.add_event_pair(hev, ev)
+    return dst
+
+
+def _get_indices_queue_usm_type(inds, queue, usm_type):
+    """
+    Utility for validating indices are NumPy ndarray or usm_ndarray of integral
+    dtype or Python integers. At least one must be an array.
+
+    For each array, the queue and usm type are appended to `queue_list` and
+    `usm_type_list`, respectively.
+    """
+    queues = [queue]
+    usm_types = [usm_type]
+    any_array = False
+    for ind in inds:
+        if isinstance(ind, (np.ndarray, dpt.usm_ndarray)):
+            any_array = True
+            if ind.dtype.kind not in "ui":
+                raise IndexError(
+                    "arrays used as indices must be of integer (or boolean) "
+                    "type"
+                )
+            if isinstance(ind, dpt.usm_ndarray):
+                queues.append(ind.sycl_queue)
+                usm_types.append(ind.usm_type)
+        elif not isinstance(ind, Integral):
+            raise TypeError(
+                "all elements of `ind` expected to be usm_ndarrays, "
+                f"NumPy arrays, or integers, found {type(ind)}"
+            )
+    if not any_array:
+        raise TypeError(
+            "at least one element of `inds` expected to be an array"
+        )
+    usm_type = dpctl.utils.get_coerced_usm_type(usm_types)
+    q = dpctl.utils.get_execution_queue(queues)
+    return q, usm_type
+
+
+def _nonzero_impl(ary):
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+        )
+    exec_q = ary.sycl_queue
+    usm_type = ary.usm_type
+    mask_nelems = ary.size
+    cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
+    cumsum = dpt.empty(
+        mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C"
+    )
+    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    mask_count = ti.mask_positions(
+        ary, cumsum, sycl_queue=exec_q, depends=dep_evs
+    )
+    indexes_dt = ti.default_device_index_type(exec_q.sycl_device)
+    indexes = dpt.empty(
+        (ary.ndim, mask_count),
+        dtype=indexes_dt,
+        usm_type=usm_type,
+        sycl_queue=exec_q,
+        order="C",
+    )
+    hev, nz_ev = ti._nonzero(cumsum, indexes, ary.shape, exec_q)
+    res = tuple(indexes[i, :] for i in range(ary.ndim))
+    _manager.add_event_pair(hev, nz_ev)
+    return res
+
+
+def _prepare_indices_arrays(inds, q, usm_type):
+    """
+    Utility taking a mix of usm_ndarray and possibly Python int scalar indices,
+    a queue (assumed to be common to arrays in inds), and a usm type.
+
+    Python scalar integers are promoted to arrays on the provided queue and
+    with the provided usm type. All arrays are then promoted to a common
+    integral type (if possible) before being broadcast to a common shape.
+    """
+    # scalar integers -> arrays
+    inds = tuple(
+        map(
+            lambda ind: (
+                ind
+                if isinstance(ind, dpt.usm_ndarray)
+                else dpt.asarray(ind, usm_type=usm_type, sycl_queue=q)
+            ),
+            inds,
+        )
+    )
+
+    # promote to a common integral type if possible
+    ind_dt = dpt.result_type(*inds)
+    if ind_dt.kind not in "ui":
+        raise ValueError(
+            "cannot safely promote indices to an integer data type"
+        )
+    inds = tuple(
+        map(
+            lambda ind: (
+                ind if ind.dtype == ind_dt else dpt.astype(ind, ind_dt)
+            ),
+            inds,
+        )
+    )
+
+    # broadcast
+    inds = dpt.broadcast_arrays(*inds)
+
+    return inds
+
+
+def _put_multi_index(ary, inds, p, vals, mode=0):
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+        )
+    ary_nd = ary.ndim
+    p = normalize_axis_index(operator.index(p), ary_nd)
+    mode = operator.index(mode)
+    if mode not in [0, 1]:
+        raise ValueError(
+            "Invalid value for mode keyword, only 0 or 1 is supported"
+        )
+    if not isinstance(inds, (list, tuple)):
+        inds = (inds,)
+
+    exec_q, coerced_usm_type = _get_indices_queue_usm_type(
+        inds, ary.sycl_queue, ary.usm_type
+    )
+
+    if exec_q is not None:
+        if not isinstance(vals, dpt.usm_ndarray):
+            vals = dpt.asarray(
+                vals,
+                dtype=ary.dtype,
+                usm_type=coerced_usm_type,
+                sycl_queue=exec_q,
+            )
+        else:
+            exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue))
+            coerced_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    coerced_usm_type,
+                    vals.usm_type,
+                )
+            )
+    if exec_q is None:
+        raise dpctl.utils.ExecutionPlacementError(
+            "Can not automatically determine where to allocate the "
+            "result or performance execution. "
+            "Use `usm_ndarray.to_device` method to migrate data to "
+            "be associated with the same queue."
+        )
+
+    inds = _prepare_indices_arrays(inds, exec_q, coerced_usm_type)
+
+    ind0 = inds[0]
+    ary_sh = ary.shape
+    p_end = p + len(inds)
+    if 0 in ary_sh[p:p_end] and ind0.size != 0:
+        raise IndexError(
+            "cannot put into non-empty indices along an empty axis"
+        )
+    expected_vals_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:]
+    if vals.dtype == ary.dtype:
+        rhs = vals
+    else:
+        rhs = dpt_ext.astype(vals, ary.dtype)
+    rhs = dpt.broadcast_to(rhs, expected_vals_shape)
+    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    dep_ev = _manager.submitted_events
+    hev, put_ev = ti._put(
+        dst=ary,
+        ind=inds,
+        val=rhs,
+        axis_start=p,
+        mode=mode,
+        sycl_queue=exec_q,
+        depends=dep_ev,
+    )
+    _manager.add_event_pair(hev, put_ev)
+    return
+
+
+def _take_multi_index(ary, inds, p, mode=0):
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+        )
+    ary_nd = ary.ndim
+    p = normalize_axis_index(operator.index(p), ary_nd)
+    mode = operator.index(mode)
+    if mode not in [0, 1]:
+        raise ValueError(
+            "Invalid value for mode keyword, only 0 or 1 is supported"
+        )
+    if not isinstance(inds, (list, tuple)):
+        inds = (inds,)
+
+    exec_q, res_usm_type = _get_indices_queue_usm_type(
+        inds, ary.sycl_queue, ary.usm_type
+    )
+    if exec_q is None:
+        raise dpctl.utils.ExecutionPlacementError(
+            "Can not automatically determine where to allocate the "
+            "result or performance execution. "
+            "Use `usm_ndarray.to_device` method to migrate data to "
+            "be associated with the same queue."
+        )
+
+    inds = _prepare_indices_arrays(inds, exec_q, res_usm_type)
+
+    ind0 = inds[0]
+    ary_sh = ary.shape
+    p_end = p + len(inds)
+    if 0 in ary_sh[p:p_end] and ind0.size != 0:
+        raise IndexError("cannot take non-empty indices from an empty axis")
+    res_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:]
+    res = dpt.empty(
+        res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q
+    )
+    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    dep_ev = _manager.submitted_events
+    hev, take_ev = ti._take(
+        src=ary,
+        ind=inds,
+        dst=res,
+        axis_start=p,
+        mode=mode,
+        sycl_queue=exec_q,
+        depends=dep_ev,
+    )
+    _manager.add_event_pair(hev, take_ev)
+    return res
+
+
 def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None):
     """
     from_numpy(arg, device=None, usm_type="device", sycl_queue=None)
diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py
index 5a39e9367e9c..5a9e07c73346 100644
--- a/dpctl_ext/tensor/_ctors.py
+++ b/dpctl_ext/tensor/_ctors.py
@@ -58,6 +58,38 @@ def _cast_fill_val(fill_val, dt):
         return fill_val
 
 
+def _ensure_native_dtype_device_support(dtype, dev) -> None:
+    """Check that dtype is natively supported by device.
+
+    Arg:
+        dtype:
+            Elemental data-type
+        dev (:class:`dpctl.SyclDevice`):
+            The device about which the query is being made.
+    Returns:
+        None
+    Raise:
+        ValueError:
+            if device does not natively support this `dtype`.
+    """
+    if dtype in [dpt.float64, dpt.complex128] and not dev.has_aspect_fp64:
+        raise ValueError(
+            f"Device {dev.name} does not provide native support "
+            "for double-precision floating point type."
+        )
+    if (
+        dtype
+        in [
+            dpt.float16,
+        ]
+        and not dev.has_aspect_fp16
+    ):
+        raise ValueError(
+            f"Device {dev.name} does not provide native support "
+            "for half-precision floating point type."
+        )
+
+
 def _to_scalar(obj, sc_ty):
     """A way to convert object to NumPy scalar type.
     Raises OverflowError if obj can not be represented
@@ -67,6 +99,102 @@ def _to_scalar(obj, sc_ty):
     return zd_arr[()]
 
 
+def eye(
+    n_rows,
+    n_cols=None,
+    /,
+    *,
+    k=0,
+    dtype=None,
+    order="C",
+    device=None,
+    usm_type="device",
+    sycl_queue=None,
+):
+    """
+    eye(n_rows, n_cols=None, /, *, k=0, dtype=None, \
+        device=None, usm_type="device", sycl_queue=None)
+
+    Creates :class:`dpctl.tensor.usm_ndarray` with ones on the `k`-th
+    diagonal.
+
+    Args:
+        n_rows (int):
+            number of rows in the output array.
+        n_cols (int, optional):
+            number of columns in the output array. If ``None``,
+            ``n_cols = n_rows``. Default: ``None``
+        k (int):
+            index of the diagonal, with ``0`` as the main diagonal.
+            A positive value of ``k`` is a superdiagonal, a negative value
+            is a subdiagonal.
+            Raises :exc:`TypeError` if ``k`` is not an integer.
+            Default: ``0``
+        dtype (optional):
+            data type of the array. Can be typestring,
+            a :class:`numpy.dtype` object, :mod:`numpy` char string, or
+            a NumPy scalar type. Default: ``None``
+        order ("C" or "F"):
+            memory layout for the array. Default: ``"C"``
+        device (optional):
+            array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            A diagonal matrix.
+    """
+    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'F' or 'C'."
+        )
+    order = order[0].upper()
+    n_rows = operator.index(n_rows)
+    n_cols = n_rows if n_cols is None else operator.index(n_cols)
+    k = operator.index(k)
+    if k >= n_cols or -k >= n_rows:
+        return dpt.zeros(
+            (n_rows, n_cols),
+            dtype=dtype,
+            order=order,
+            device=device,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+        )
+    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dtype = _get_dtype(dtype, sycl_queue)
+    _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+    res = dpt.usm_ndarray(
+        (n_rows, n_cols),
+        dtype=dtype,
+        buffer=usm_type,
+        order=order,
+        buffer_ctor_kwargs={"queue": sycl_queue},
+    )
+    if n_rows != 0 and n_cols != 0:
+        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+        hev, eye_ev = ti._eye(k, dst=res, sycl_queue=sycl_queue)
+        _manager.add_event_pair(hev, eye_ev)
+    return res
+
+
 def _validate_fill_value(fill_val):
     """Validates that `fill_val` is a numeric or boolean scalar."""
     # TODO: verify if `np.True_` and `np.False_` should be instances of
diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py
index df4f3e953042..6ca327192f73 100644
--- a/dpctl_ext/tensor/_indexing_functions.py
+++ b/dpctl_ext/tensor/_indexing_functions.py
@@ -37,6 +37,12 @@
 import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._tensor_impl as ti
 
+from ._copy_utils import (
+    _extract_impl,
+    _nonzero_impl,
+    _put_multi_index,
+    _take_multi_index,
+)
 from ._numpy_helper import normalize_axis_index
 
 
@@ -50,6 +56,152 @@ def _get_indexing_mode(name):
         )
 
 
+def _range(sh_i, i, nd, q, usm_t, dt):
+    ind = dpt.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q)
+    ind.shape = tuple(sh_i if i == j else 1 for j in range(nd))
+    return ind
+
+
+def extract(condition, arr):
+    """extract(condition, arr)
+
+    Returns the elements of an array that satisfies the condition.
+
+    If ``condition`` is boolean ``dpctl.tensor.extract`` is
+    equivalent to ``arr[condition]``.
+
+    Note that ``dpctl.tensor.place`` does the opposite of
+    ``dpctl.tensor.extract``.
+
+    Args:
+       conditions (usm_ndarray):
+            An array whose non-zero or ``True`` entries indicate the element
+            of ``arr`` to extract.
+
+       arr (usm_ndarray):
+            Input array of the same size as ``condition``.
+
+    Returns:
+        usm_ndarray:
+            Rank 1 array of values from ``arr`` where ``condition`` is
+            ``True``.
+    """
+    if not isinstance(condition, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}"
+        )
+    if not isinstance(arr, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
+        )
+    exec_q = dpctl.utils.get_execution_queue(
+        (
+            condition.sycl_queue,
+            arr.sycl_queue,
+        )
+    )
+    if exec_q is None:
+        raise dpctl.utils.ExecutionPlacementError
+    if condition.shape != arr.shape:
+        raise ValueError("Arrays are not of the same size")
+    return _extract_impl(arr, condition)
+
+
+def nonzero(arr):
+    """nonzero(arr)
+
+    Return the indices of non-zero elements.
+
+    Returns a tuple of usm_ndarrays, one for each dimension
+    of ``arr``, containing the indices of the non-zero elements
+    in that dimension. The values of ``arr`` are always tested in
+    row-major, C-style order.
+
+    Args:
+        arr (usm_ndarray):
+            Input array, which has non-zero array rank.
+
+    Returns:
+        Tuple[usm_ndarray, ...]:
+            Indices of non-zero array elements.
+    """
+    if not isinstance(arr, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
+        )
+    if arr.ndim == 0:
+        raise ValueError("Array of positive rank is expected")
+    return _nonzero_impl(arr)
+
+
+def place(arr, mask, vals):
+    """place(arr, mask, vals)
+
+    Change elements of an array based on conditional and input values.
+
+    If ``mask`` is boolean ``dpctl.tensor.place`` is
+    equivalent to ``arr[condition] = vals``.
+
+    Args:
+        arr (usm_ndarray):
+            Array to put data into.
+        mask (usm_ndarray):
+            Boolean mask array. Must have the same size as ``arr``.
+        vals (usm_ndarray, sequence):
+            Values to put into ``arr``. Only the first N elements are
+            used, where N is the number of True values in ``mask``. If
+            ``vals`` is smaller than N, it will be repeated, and if
+            elements of ``arr`` are to be masked, this sequence must be
+            non-empty. Array ``vals`` must be one dimensional.
+    """
+    if not isinstance(arr, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
+        )
+    if not isinstance(mask, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(mask)}"
+        )
+    if not isinstance(vals, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(vals)}"
+        )
+    exec_q = dpctl.utils.get_execution_queue(
+        (
+            arr.sycl_queue,
+            mask.sycl_queue,
+            vals.sycl_queue,
+        )
+    )
+    if exec_q is None:
+        raise dpctl.utils.ExecutionPlacementError
+    if arr.shape != mask.shape or vals.ndim != 1:
+        raise ValueError("Array sizes are not as required")
+    cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q)
+    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    deps_ev = _manager.submitted_events
+    nz_count = ti.mask_positions(
+        mask, cumsum, sycl_queue=exec_q, depends=deps_ev
+    )
+    if nz_count == 0:
+        return
+    if vals.size == 0:
+        raise ValueError("Cannot insert from an empty array!")
+    if vals.dtype == arr.dtype:
+        rhs = vals
+    else:
+        rhs = dpt.astype(vals, arr.dtype)
+    hev, pl_ev = ti._place(
+        dst=arr,
+        cumsum=cumsum,
+        axis_start=0,
+        axis_end=mask.ndim,
+        rhs=rhs,
+        sycl_queue=exec_q,
+    )
+    _manager.add_event_pair(hev, pl_ev)
+
+
 def put(x, indices, vals, /, *, axis=None, mode="wrap"):
     """put(x, indices, vals, axis=None, mode="wrap")
 
@@ -199,6 +351,86 @@ def put_vec_duplicates(vec, ind, vals):
     _manager.add_event_pair(hev, put_ev)
 
 
+def put_along_axis(x, indices, vals, /, *, axis=-1, mode="wrap"):
+    """
+    Puts elements into an array at the one-dimensional indices specified by
+    ``indices`` along a provided ``axis``.
+
+    Args:
+        x (usm_ndarray):
+            input array. Must be compatible with ``indices``, except for the
+            axis (dimension) specified by ``axis``.
+        indices (usm_ndarray):
+            array indices. Must have the same rank (i.e., number of dimensions)
+            as ``x``.
+        vals (usm_ndarray):
+            Array of values to be put into ``x``.
+            Must be broadcastable to the shape of ``indices``.
+        axis: int
+            axis along which to select values. If ``axis`` is negative, the
+            function determines the axis along which to select values by
+            counting from the last dimension. Default: ``-1``.
+        mode (str, optional):
+            How out-of-bounds indices will be handled. Possible values
+            are:
+
+            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
+              negative indices.
+            - ``"clip"``: clips indices to (``0 <= i < n``).
+
+            Default: ``"wrap"``.
+
+    .. note::
+
+        If input array ``indices`` contains duplicates, a race condition
+        occurs, and the value written into corresponding positions in ``x``
+        may vary from run to run. Preserving sequential semantics in handing
+        the duplicates to achieve deterministic behavior requires additional
+        work.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+    if not isinstance(indices, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpctl.tensor.usm_ndarray, got {type(indices)}"
+        )
+    x_nd = x.ndim
+    if x_nd != indices.ndim:
+        raise ValueError(
+            "Number of dimensions in the first and the second "
+            "argument arrays must be equal"
+        )
+    pp = normalize_axis_index(operator.index(axis), x_nd)
+    if isinstance(vals, dpt.usm_ndarray):
+        queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue]
+        usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type]
+    else:
+        queues_ = [x.sycl_queue, indices.sycl_queue]
+        usm_types_ = [x.usm_type, indices.usm_type]
+    exec_q = dpctl.utils.get_execution_queue(queues_)
+    if exec_q is None:
+        raise dpctl.utils.ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments. "
+        )
+    out_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_)
+    mode_i = _get_indexing_mode(mode)
+    indexes_dt = (
+        dpt.uint64
+        if indices.dtype == dpt.uint64
+        else ti.default_device_index_type(exec_q.sycl_device)
+    )
+    _ind = tuple(
+        (
+            indices
+            if i == pp
+            else _range(x.shape[i], i, x_nd, exec_q, out_usm_type, indexes_dt)
+        )
+        for i in range(x_nd)
+    )
+    return _put_multi_index(x, _ind, 0, vals, mode=mode_i)
+
+
 def take(x, indices, /, *, axis=None, out=None, mode="wrap"):
     """take(x, indices, axis=None, out=None, mode="wrap")
 
@@ -330,3 +562,80 @@ def take(x, indices, /, *, axis=None, out=None, mode="wrap"):
         out = orig_out
 
     return out
+
+
+def take_along_axis(x, indices, /, *, axis=-1, mode="wrap"):
+    """
+    Returns elements from an array at the one-dimensional indices specified
+    by ``indices`` along a provided ``axis``.
+
+    Args:
+        x (usm_ndarray):
+            input array. Must be compatible with ``indices``, except for the
+            axis (dimension) specified by ``axis``.
+        indices (usm_ndarray):
+            array indices. Must have the same rank (i.e., number of dimensions)
+            as ``x``.
+        axis: int
+            axis along which to select values. If ``axis`` is negative, the
+            function determines the axis along which to select values by
+            counting from the last dimension. Default: ``-1``.
+        mode (str, optional):
+            How out-of-bounds indices will be handled. Possible values
+            are:
+
+            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
+              negative indices.
+            - ``"clip"``: clips indices to (``0 <= i < n``).
+
+            Default: ``"wrap"``.
+
+    Returns:
+        usm_ndarray:
+            an array having the same data type as ``x``. The returned array has
+            the same rank (i.e., number of dimensions) as ``x`` and a shape
+            determined according to broadcasting rules, except for the axis
+            (dimension) specified by ``axis`` whose size must equal the size
+            of the corresponding axis (dimension) in ``indices``.
+
+    Note:
+        Treatment of the out-of-bound indices in ``indices`` array is controlled
+        by the value of ``mode`` keyword.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+    if not isinstance(indices, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpctl.tensor.usm_ndarray, got {type(indices)}"
+        )
+    x_nd = x.ndim
+    if x_nd != indices.ndim:
+        raise ValueError(
+            "Number of dimensions in the first and the second "
+            "argument arrays must be equal"
+        )
+    pp = normalize_axis_index(operator.index(axis), x_nd)
+    out_usm_type = dpctl.utils.get_coerced_usm_type(
+        (x.usm_type, indices.usm_type)
+    )
+    exec_q = dpctl.utils.get_execution_queue((x.sycl_queue, indices.sycl_queue))
+    if exec_q is None:
+        raise dpctl.utils.ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments. "
+        )
+    mode_i = _get_indexing_mode(mode)
+    indexes_dt = (
+        dpt.uint64
+        if indices.dtype == dpt.uint64
+        else ti.default_device_index_type(exec_q.sycl_device)
+    )
+    _ind = tuple(
+        (
+            indices
+            if i == pp
+            else _range(x.shape[i], i, x_nd, exec_q, out_usm_type, indexes_dt)
+        )
+        for i in range(x_nd)
+    )
+    return _take_multi_index(x, _ind, 0, mode=mode_i)
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp
new file mode 100644
index 000000000000..6451bc950006
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp
@@ -0,0 +1,1448 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for accumulators (cumulative sum, prod, etc.).
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::accumulators
+{
+
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::offset_utils;
+
+template <typename T>
+T ceiling_quotient(T n, T m)
+{
+    return (n + m - 1) / m;
+}
+
+template <typename inputT, typename outputT>
+struct NonZeroIndicator
+{
+    constexpr NonZeroIndicator() {}
+
+    outputT operator()(const inputT &val) const
+    {
+        static constexpr outputT out_one(1);
+        static constexpr outputT out_zero(0);
+        static constexpr inputT val_zero(0);
+
+        return (val == val_zero) ? out_zero : out_one;
+    }
+};
+
+template <typename T>
+struct NoOpTransformer
+{
+    constexpr NoOpTransformer() {}
+
+    T operator()(const T &val) const
+    {
+        return val;
+    }
+};
+
+template <typename srcTy, typename dstTy>
+struct CastTransformer
+{
+    constexpr CastTransformer() {}
+
+    dstTy operator()(const srcTy &val) const
+    {
+        using dpctl::tensor::type_utils::convert_impl;
+        return convert_impl<dstTy, srcTy>(val);
+    }
+};
+
+template <typename ScanOpT, typename T>
+struct needs_workaround
+{
+    // workaround needed due to crash in JITing on CPU
+    // remove when CMPLRLLVM-65813 is resolved
+    static constexpr bool value = su_ns::IsSyclLogicalAnd<T, ScanOpT>::value ||
+                                  su_ns::IsSyclLogicalOr<T, ScanOpT>::value;
+};
+
+template <typename BinOpT, typename T>
+struct can_use_inclusive_scan_over_group
+{
+    static constexpr bool value = sycl::has_known_identity<BinOpT, T>::value &&
+                                  !needs_workaround<BinOpT, T>::value;
+};
+
+namespace detail
+{
+template <typename T>
+class stack_t
+{
+    T *src_;
+    std::size_t size_;
+    T *local_scans_;
+
+public:
+    stack_t() : src_{}, size_{}, local_scans_{} {}
+    stack_t(T *src, std::size_t sz, T *local_scans)
+        : src_(src), size_(sz), local_scans_(local_scans)
+    {
+    }
+    ~stack_t(){};
+
+    T *get_src_ptr() const
+    {
+        return src_;
+    }
+
+    std::size_t get_size() const
+    {
+        return size_;
+    }
+
+    T *get_local_scans_ptr() const
+    {
+        return local_scans_;
+    }
+};
+
+template <typename T>
+class stack_strided_t
+{
+    T *src_;
+    std::size_t size_;
+    T *local_scans_;
+    std::size_t local_stride_;
+
+public:
+    stack_strided_t() : src_{}, size_{}, local_scans_{}, local_stride_{} {}
+    stack_strided_t(T *src,
+                    std::size_t sz,
+                    T *local_scans,
+                    std::size_t local_stride)
+        : src_(src), size_(sz), local_scans_(local_scans),
+          local_stride_(local_stride)
+    {
+    }
+    ~stack_strided_t(){};
+
+    T *get_src_ptr() const
+    {
+        return src_;
+    }
+
+    std::size_t get_size() const
+    {
+        return size_;
+    }
+
+    T *get_local_scans_ptr() const
+    {
+        return local_scans_;
+    }
+
+    std::size_t get_local_stride() const
+    {
+        return local_stride_;
+    }
+};
+
+} // end of namespace detail
+
+// Iterative cumulative summation
+
+using nwiT = std::uint32_t;
+
+template <typename inputT,
+          typename outputT,
+          nwiT n_wi,
+          typename IterIndexerT,
+          typename InpIndexerT,
+          typename OutIndexerT,
+          typename TransformerT,
+          typename ScanOpT,
+          bool include_initial>
+class inclusive_scan_iter_local_scan_blocked_krn;
+
+template <typename inputT,
+          typename outputT,
+          nwiT n_wi,
+          typename IterIndexerT,
+          typename InpIndexerT,
+          typename OutIndexerT,
+          typename TransformerT,
+          typename ScanOpT,
+          bool include_initial>
+class inclusive_scan_iter_local_scan_striped_krn;
+
+template <typename inputT,
+          typename outputT,
+          nwiT n_wi,
+          typename IterIndexerT,
+          typename InpIndexerT,
+          typename OutIndexerT,
+          typename TransformerT,
+          typename ScanOpT,
+          bool include_initial = false>
+sycl::event inclusive_scan_base_step_blocked(
+    sycl::queue &exec_q,
+    const std::uint32_t wg_size,
+    const std::size_t iter_nelems,
+    const std::size_t acc_nelems,
+    const inputT *input,
+    outputT *output,
+    const std::size_t s0,
+    const std::size_t s1,
+    const IterIndexerT &iter_indexer,
+    const InpIndexerT &inp_indexer,
+    const OutIndexerT &out_indexer,
+    TransformerT transformer,
+    const ScanOpT &scan_op,
+    outputT identity,
+    std::size_t &acc_groups,
+    const std::vector<sycl::event> &depends = {})
+{
+    acc_groups = ceiling_quotient<std::size_t>(acc_nelems, n_wi * wg_size);
+
+    sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using slmT = sycl::local_accessor<outputT, 1>;
+
+        auto gws = sycl::range<1>(iter_nelems * acc_groups * wg_size);
+        auto lws = sycl::range<1>(wg_size);
+
+        auto ndRange = sycl::nd_range<1>(gws, lws);
+
+        slmT slm_iscan_tmp(lws, cgh);
+
+        using KernelName = inclusive_scan_iter_local_scan_blocked_krn<
+            inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT,
+            TransformerT, ScanOpT, include_initial>;
+
+        cgh.parallel_for<KernelName>(ndRange, [=, slm_iscan_tmp =
+                                                      std::move(slm_iscan_tmp)](
+                                                  sycl::nd_item<1> it) {
+            const std::size_t gid = it.get_global_id(0);
+            const std::size_t lid = it.get_local_id(0);
+
+            const std::uint32_t wg_size = it.get_local_range(0);
+            const std::size_t reduce_chunks = acc_groups * wg_size;
+            const std::size_t iter_gid = gid / reduce_chunks;
+            const std::size_t chunk_gid = gid - (iter_gid * reduce_chunks);
+
+            const std::size_t i = chunk_gid * n_wi;
+            const auto &iter_offsets = iter_indexer(iter_gid);
+            const auto &inp_iter_offset = iter_offsets.get_first_offset();
+            const auto &out_iter_offset = iter_offsets.get_second_offset();
+
+            std::array<outputT, n_wi> local_iscan;
+
+#pragma unroll
+            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
+                const std::size_t i_m_wi = i + m_wi;
+                if constexpr (!include_initial) {
+                    local_iscan[m_wi] =
+                        (i_m_wi < acc_nelems)
+                            ? transformer(input[inp_iter_offset +
+                                                inp_indexer(s0 + s1 * i_m_wi)])
+                            : identity;
+                }
+                else {
+                    // shift input to the left by a single element relative to
+                    // output
+                    local_iscan[m_wi] =
+                        (i_m_wi < acc_nelems && i_m_wi > 0)
+                            ? transformer(
+                                  input[inp_iter_offset +
+                                        inp_indexer((s0 + s1 * i_m_wi) - 1)])
+                            : identity;
+                }
+            }
+
+#pragma unroll
+            for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) {
+                local_iscan[m_wi] =
+                    scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]);
+            }
+            // local_iscan is now result of
+            // inclusive scan of locally stored inputs
+
+            outputT wg_iscan_val;
+            if constexpr (can_use_inclusive_scan_over_group<ScanOpT,
+                                                            outputT>::value) {
+                wg_iscan_val = sycl::inclusive_scan_over_group(
+                    it.get_group(), local_iscan.back(), scan_op, identity);
+            }
+            else {
+                wg_iscan_val = su_ns::custom_inclusive_scan_over_group(
+                    it.get_group(), it.get_sub_group(), slm_iscan_tmp,
+                    local_iscan.back(), identity, scan_op);
+                // ensure all finished reading from SLM, to avoid race condition
+                // with subsequent writes into SLM
+                it.barrier(sycl::access::fence_space::local_space);
+            }
+
+            slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val;
+            it.barrier(sycl::access::fence_space::local_space);
+            const outputT modifier = (lid == 0) ? identity : slm_iscan_tmp[lid];
+
+#pragma unroll
+            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
+                local_iscan[m_wi] = scan_op(local_iscan[m_wi], modifier);
+            }
+
+            const std::size_t start = std::min(i, acc_nelems);
+            const std::size_t end = std::min(i + n_wi, acc_nelems);
+            const nwiT m_max = static_cast<nwiT>(end - start);
+            for (nwiT m_wi = 0; m_wi < m_max; ++m_wi) {
+                output[out_iter_offset + out_indexer(i + m_wi)] =
+                    local_iscan[m_wi];
+            }
+        });
+    });
+
+    return inc_scan_phase1_ev;
+}
+
+template <typename inputT,
+          typename outputT,
+          nwiT n_wi,
+          typename IterIndexerT,
+          typename InpIndexerT,
+          typename OutIndexerT,
+          typename TransformerT,
+          typename ScanOpT,
+          bool include_initial = false>
+sycl::event inclusive_scan_base_step_striped(
+    sycl::queue &exec_q,
+    const std::uint32_t wg_size,
+    const std::size_t iter_nelems,
+    const std::size_t acc_nelems,
+    const inputT *input,
+    outputT *output,
+    const std::size_t s0,
+    const std::size_t s1,
+    const IterIndexerT &iter_indexer,
+    const InpIndexerT &inp_indexer,
+    const OutIndexerT &out_indexer,
+    TransformerT transformer,
+    const ScanOpT &scan_op,
+    outputT identity,
+    std::size_t &acc_groups,
+    const std::vector<sycl::event> &depends = {})
+{
+    const std::uint32_t reduce_nelems_per_wg = n_wi * wg_size;
+    acc_groups =
+        ceiling_quotient<std::size_t>(acc_nelems, reduce_nelems_per_wg);
+
+    sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using slmT = sycl::local_accessor<outputT, 1>;
+
+        const auto &gRange = sycl::range<1>{iter_nelems * acc_groups * wg_size};
+        const auto &lRange = sycl::range<1>{wg_size};
+
+        const auto &ndRange = sycl::nd_range<1>{gRange, lRange};
+
+        slmT slm_iscan_tmp(reduce_nelems_per_wg, cgh);
+
+        using KernelName = inclusive_scan_iter_local_scan_striped_krn<
+            inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT,
+            TransformerT, ScanOpT, include_initial>;
+
+        cgh.parallel_for<KernelName>(ndRange, [=, slm_iscan_tmp =
+                                                      std::move(slm_iscan_tmp)](
+                                                  sycl::nd_item<1> it) {
+            const std::uint32_t lid = it.get_local_linear_id();
+            const std::uint32_t wg_size = it.get_local_range(0);
+
+            const auto &sg = it.get_sub_group();
+            const std::uint32_t sgSize = sg.get_max_local_range()[0];
+            const std::size_t sgroup_id = sg.get_group_id()[0];
+            const std::uint32_t lane_id = sg.get_local_id()[0];
+
+            const std::size_t flat_group_id = it.get_group(0);
+            const std::size_t iter_gid = flat_group_id / acc_groups;
+            const std::size_t acc_group_id =
+                flat_group_id - (iter_gid * acc_groups);
+
+            const auto &iter_offsets = iter_indexer(iter_gid);
+            const auto &inp_iter_offset = iter_offsets.get_first_offset();
+            const auto &out_iter_offset = iter_offsets.get_second_offset();
+
+            std::array<outputT, n_wi> local_iscan{};
+
+            const std::size_t inp_id0 = acc_group_id * n_wi * wg_size +
+                                        sgroup_id * n_wi * sgSize + lane_id;
+
+#pragma unroll
+            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
+                const std::size_t inp_id = inp_id0 + m_wi * sgSize;
+                if constexpr (!include_initial) {
+                    local_iscan[m_wi] =
+                        (inp_id < acc_nelems)
+                            ? transformer(input[inp_iter_offset +
+                                                inp_indexer(s0 + s1 * inp_id)])
+                            : identity;
+                }
+                else {
+                    // shift input to the left by a single element relative to
+                    // output
+                    local_iscan[m_wi] =
+                        (inp_id < acc_nelems && inp_id > 0)
+                            ? transformer(
+                                  input[inp_iter_offset +
+                                        inp_indexer((s0 + s1 * inp_id) - 1)])
+                            : identity;
+                }
+            }
+
+            // change layout from striped to blocked
+            {
+                {
+                    const std::uint32_t local_offset0 = lid * n_wi;
+#pragma unroll
+                    for (std::uint32_t i = 0; i < n_wi; ++i) {
+                        slm_iscan_tmp[local_offset0 + i] = local_iscan[i];
+                    }
+
+                    it.barrier(sycl::access::fence_space::local_space);
+                }
+
+                {
+                    const std::uint32_t block_offset =
+                        sgroup_id * sgSize * n_wi;
+                    const std::uint32_t disp0 = lane_id * n_wi;
+#pragma unroll
+                    for (nwiT i = 0; i < n_wi; ++i) {
+                        const std::uint32_t disp = disp0 + i;
+
+                        // disp == lane_id1 + i1 * sgSize;
+                        const std::uint32_t i1 = disp / sgSize;
+                        const std::uint32_t lane_id1 = disp - i1 * sgSize;
+
+                        const std::uint32_t disp_exchanged =
+                            (lane_id1 * n_wi + i1);
+
+                        local_iscan[i] =
+                            slm_iscan_tmp[block_offset + disp_exchanged];
+                    }
+
+                    it.barrier(sycl::access::fence_space::local_space);
+                }
+            }
+
+#pragma unroll
+            for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) {
+                local_iscan[m_wi] =
+                    scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]);
+            }
+            // local_iscan is now result of
+            // inclusive scan of locally stored inputs
+
+            outputT wg_iscan_val;
+            if constexpr (can_use_inclusive_scan_over_group<ScanOpT,
+                                                            outputT>::value) {
+                wg_iscan_val = sycl::inclusive_scan_over_group(
+                    it.get_group(), local_iscan.back(), scan_op, identity);
+            }
+            else {
+                wg_iscan_val = su_ns::custom_inclusive_scan_over_group(
+                    it.get_group(), sg, slm_iscan_tmp, local_iscan.back(),
+                    identity, scan_op);
+                // ensure all finished reading from SLM, to avoid race condition
+                // with subsequent writes into SLM
+                it.barrier(sycl::access::fence_space::local_space);
+            }
+
+            slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val;
+            it.barrier(sycl::access::fence_space::local_space);
+            const outputT modifier = (lid == 0) ? identity : slm_iscan_tmp[lid];
+
+#pragma unroll
+            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
+                local_iscan[m_wi] = scan_op(local_iscan[m_wi], modifier);
+            }
+
+            it.barrier(sycl::access::fence_space::local_space);
+
+            // convert back to blocked layout
+            {{const std::uint32_t local_offset0 = lid * n_wi;
+#pragma unroll
+            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
+                slm_iscan_tmp[local_offset0 + m_wi] = local_iscan[m_wi];
+            }
+
+            it.barrier(sycl::access::fence_space::local_space);
+                }
+            }
+
+            {
+        const std::uint32_t block_offset = sgroup_id * sgSize * n_wi + lane_id;
+#pragma unroll
+        for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
+            const std::uint32_t m_wi_scaled = m_wi * sgSize;
+            const std::size_t out_id = inp_id0 + m_wi_scaled;
+            if (out_id < acc_nelems) {
+                output[out_iter_offset + out_indexer(out_id)] =
+                    slm_iscan_tmp[block_offset + m_wi_scaled];
+            }
+        }
+            }
+});
+});
+
+return inc_scan_phase1_ev;
+}
+
+template <typename inputT,
+          typename outputT,
+          nwiT n_wi,
+          typename IterIndexerT,
+          typename InpIndexerT,
+          typename OutIndexerT,
+          typename TransformerT,
+          typename ScanOpT,
+          bool include_initial = false>
+sycl::event
+    inclusive_scan_base_step(sycl::queue &exec_q,
+                             const std::uint32_t wg_size,
+                             const std::size_t iter_nelems,
+                             const std::size_t acc_nelems,
+                             const inputT *input,
+                             outputT *output,
+                             const std::size_t s0,
+                             const std::size_t s1,
+                             const IterIndexerT &iter_indexer,
+                             const InpIndexerT &inp_indexer,
+                             const OutIndexerT &out_indexer,
+                             TransformerT transformer,
+                             const ScanOpT &scan_op,
+                             outputT identity,
+                             std::size_t &acc_groups,
+                             const std::vector<sycl::event> &depends = {})
+{
+    // For small stride use striped load/store.
+    // Threshold value chosen experimentally.
+    if (s1 <= 16) {
+        return inclusive_scan_base_step_striped<
+            inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT,
+            TransformerT, ScanOpT, include_initial>(
+            exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1,
+            iter_indexer, inp_indexer, out_indexer, transformer, scan_op,
+            identity, acc_groups, depends);
+    }
+    else {
+        return inclusive_scan_base_step_blocked<
+            inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT,
+            TransformerT, ScanOpT, include_initial>(
+            exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1,
+            iter_indexer, inp_indexer, out_indexer, transformer, scan_op,
+            identity, acc_groups, depends);
+    }
+}
+
+template <typename outputT, nwiT n_wi, typename ScanOpT>
+class inclusive_scan_1d_iter_chunk_update_krn;
+
+template <typename UpdateKernelName,
+          typename outputT,
+          nwiT n_wi,
+          typename ScanOpT>
+sycl::event update_local_chunks_1d(sycl::queue &exec_q,
+                                   outputT *src,
+                                   std::size_t src_size,
+                                   const outputT *local_scans,
+                                   std::size_t chunk_size,
+                                   const sycl::event &dependent_event)
+{
+    const auto &ctx = exec_q.get_context();
+    const auto &dev = exec_q.get_device();
+
+    const auto &kernel_id = sycl::get_kernel_id<UpdateKernelName>();
+    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+        ctx, {dev}, {kernel_id});
+    auto krn = kb.get_kernel(kernel_id);
+
+    const std::uint32_t sg_size = krn.template get_info<
+        sycl::info::kernel_device_specific::max_sub_group_size>(dev);
+
+    // output[ chunk_size * (i + 1) + j] += temp[i]
+    sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependent_event);
+        cgh.use_kernel_bundle(kb);
+
+        static constexpr nwiT updates_per_wi = n_wi;
+        const std::size_t n_items =
+            ceiling_quotient<std::size_t>(src_size, sg_size * n_wi) * sg_size;
+
+        sycl::range<1> gRange{n_items};
+        sycl::range<1> lRange{sg_size};
+        sycl::nd_range<1> ndRange{gRange, lRange};
+
+        cgh.parallel_for<UpdateKernelName>(
+            ndRange,
+            [chunk_size, src, src_size, local_scans](sycl::nd_item<1> ndit) {
+                static constexpr ScanOpT scan_op{};
+                static constexpr outputT identity =
+                    su_ns::Identity<ScanOpT, outputT>::value;
+
+                const std::uint32_t lws = ndit.get_local_range(0);
+                const std::size_t block_offset = ndit.get_group(0) * n_wi * lws;
+#pragma unroll
+                for (std::size_t i = 0; i < updates_per_wi; ++i) {
+                    const std::size_t src_id =
+                        block_offset + ndit.get_local_id(0) + i * lws;
+                    if (src_id < src_size) {
+                        const std::size_t scan_id = (src_id / chunk_size);
+                        const outputT modifier =
+                            (scan_id > 0) ? local_scans[scan_id - 1] : identity;
+                        src[src_id] = scan_op(src[src_id], modifier);
+                    }
+                }
+            });
+    });
+
+    return update_event;
+}
+
+/*
+ * output[j] = sum( input[s0 + i * s1], 0 <= i <= j)
+ * for 0 <= j < n_elems
+ */
+template <typename inputT,
+          typename outputT,
+          nwiT n_wi,
+          typename IndexerT,
+          typename TransformerT,
+          typename ScanOpT,
+          bool include_initial>
+sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
+                                   const std::uint32_t wg_size,
+                                   const std::size_t n_elems,
+                                   const inputT *input,
+                                   outputT *output,
+                                   const std::size_t s0,
+                                   const std::size_t s1,
+                                   const IndexerT &indexer,
+                                   const TransformerT &transformer,
+                                   std::vector<sycl::event> &host_tasks,
+                                   const std::vector<sycl::event> &depends = {})
+{
+    static constexpr ScanOpT scan_op{};
+    static constexpr outputT identity =
+        su_ns::Identity<ScanOpT, outputT>::value;
+
+    static constexpr std::size_t _iter_nelems = 1;
+
+    using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer;
+    static constexpr IterIndexerT _no_op_iter_indexer{};
+
+    using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+    static constexpr NoOpIndexerT _no_op_indexer{};
+
+    std::size_t n_groups;
+    sycl::event inc_scan_phase1_ev =
+        inclusive_scan_base_step<inputT, outputT, n_wi, IterIndexerT, IndexerT,
+                                 NoOpIndexerT, TransformerT, ScanOpT,
+                                 include_initial>(
+            exec_q, wg_size, _iter_nelems, n_elems, input, output, s0, s1,
+            _no_op_iter_indexer, indexer, _no_op_indexer, transformer, scan_op,
+            identity, n_groups, depends);
+
+    sycl::event dependent_event = inc_scan_phase1_ev;
+    if (n_groups > 1) {
+        const std::size_t chunk_size = wg_size * n_wi;
+
+        // how much of temporary allocation do we need
+        std::size_t n_groups_ = n_groups;
+        std::size_t temp_size = 0;
+        while (n_groups_ > 1) {
+            const std::size_t this_size = (n_groups_ - 1);
+            temp_size += this_size;
+            n_groups_ = ceiling_quotient(this_size, chunk_size);
+        }
+
+        // allocate
+        auto temp_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<outputT>(temp_size,
+                                                                     exec_q);
+        outputT *temp = temp_owner.get();
+
+        std::vector<detail::stack_t<outputT>> stack{};
+
+        // inclusive scans over blocks
+        n_groups_ = n_groups;
+        outputT *src = output;
+        outputT *local_scans = temp;
+
+        using NoOpTransformerT = NoOpTransformer<outputT>;
+        static constexpr NoOpTransformerT _no_op_transformer{};
+        std::size_t size_to_update = n_elems;
+        while (n_groups_ > 1) {
+
+            const std::size_t src_size = n_groups_ - 1;
+            dependent_event =
+                inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
+                                         NoOpIndexerT, NoOpIndexerT,
+                                         NoOpTransformerT, ScanOpT>(
+                    exec_q, wg_size, _iter_nelems, src_size, src, local_scans,
+                    chunk_size - 1, chunk_size, _no_op_iter_indexer,
+                    _no_op_indexer, _no_op_indexer, _no_op_transformer, scan_op,
+                    identity, n_groups_, // n_groups_ is modified in place
+                    {dependent_event});
+            stack.push_back({src, size_to_update, local_scans});
+            src = local_scans;
+            local_scans += src_size;
+            size_to_update = src_size;
+        }
+
+        for (std::size_t reverse_stack_id = 0; reverse_stack_id < stack.size();
+             ++reverse_stack_id)
+        {
+            const std::size_t stack_id = stack.size() - 1 - reverse_stack_id;
+
+            const auto &stack_elem = stack[stack_id];
+            outputT *src = stack_elem.get_src_ptr();
+            const std::size_t src_size = stack_elem.get_size();
+            const outputT *local_scans = stack_elem.get_local_scans_ptr();
+
+            using UpdateKernelName =
+                class inclusive_scan_1d_iter_chunk_update_krn<outputT, n_wi,
+                                                              ScanOpT>;
+
+            dependent_event = update_local_chunks_1d<UpdateKernelName, outputT,
+                                                     n_wi, ScanOpT>(
+                exec_q, src, src_size, local_scans, chunk_size,
+                dependent_event);
+        }
+
+        sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {dependent_event}, temp_owner);
+
+        host_tasks.push_back(free_ev);
+    }
+
+    return dependent_event;
+}
+
+typedef sycl::event (*accumulate_1d_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    char *,
+    std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+template <typename srcT,
+          typename dstT,
+          typename transformerT,
+          typename AccumulateOpT,
+          bool include_initial>
+sycl::event
+    accumulate_1d_contig_impl(sycl::queue &q,
+                              std::size_t n_elems,
+                              const char *src,
+                              char *dst,
+                              std::vector<sycl::event> &host_tasks,
+                              const std::vector<sycl::event> &depends = {})
+{
+    const srcT *src_data_ptr = reinterpret_cast<const srcT *>(src);
+    dstT *dst_data_ptr = reinterpret_cast<dstT *>(dst);
+
+    using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+    static constexpr NoOpIndexerT flat_indexer{};
+    static constexpr transformerT transformer{};
+
+    static constexpr std::size_t s0 = 0;
+    static constexpr std::size_t s1 = 1;
+
+    sycl::event comp_ev;
+    const sycl::device &dev = q.get_device();
+    if (dev.has(sycl::aspect::cpu)) {
+        static constexpr nwiT n_wi_for_cpu = 8;
+        const std::uint32_t wg_size = 256;
+        comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_cpu, NoOpIndexerT,
+                                         transformerT, AccumulateOpT,
+                                         include_initial>(
+            q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1,
+            flat_indexer, transformer, host_tasks, depends);
+    }
+    else {
+        static constexpr nwiT n_wi_for_gpu = 4;
+        // base_scan_striped algorithm does not execute correctly
+        // on HIP device with wg_size > 64
+        const std::uint32_t wg_size =
+            (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256;
+        comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_gpu, NoOpIndexerT,
+                                         transformerT, AccumulateOpT,
+                                         include_initial>(
+            q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1,
+            flat_indexer, transformer, host_tasks, depends);
+    }
+    return comp_ev;
+}
+
+template <typename outputT,
+          nwiT n_wi,
+          typename IterIndexerT,
+          typename IndexerT,
+          typename ScanOpT>
+class inclusive_scan_final_chunk_update_krn;
+
+template <typename UpdateKernelName,
+          typename outputT,
+          nwiT n_wi,
+          typename OutIterIndexerT,
+          typename OutIndexerT,
+          typename ScanOpT>
+sycl::event final_update_local_chunks(sycl::queue &exec_q,
+                                      std::size_t iter_nelems,
+                                      outputT *src,
+                                      std::size_t src_size,
+                                      const outputT *local_scans,
+                                      std::size_t chunk_size,
+                                      std::size_t local_stride,
+                                      const OutIterIndexerT &out_iter_indexer,
+                                      const OutIndexerT &out_indexer,
+                                      sycl::event dependent_event)
+{
+    const auto &kernel_id = sycl::get_kernel_id<UpdateKernelName>();
+
+    auto const &ctx = exec_q.get_context();
+    auto const &dev = exec_q.get_device();
+    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+        ctx, {dev}, {kernel_id});
+
+    auto krn = kb.get_kernel(kernel_id);
+
+    const std::uint32_t sg_size = krn.template get_info<
+        sycl::info::kernel_device_specific::max_sub_group_size>(dev);
+
+    static constexpr nwiT updates_per_wi = n_wi;
+    const std::size_t updates_per_sg = sg_size * updates_per_wi;
+    const std::size_t update_nelems =
+        ceiling_quotient(src_size, updates_per_sg) * sg_size;
+
+    sycl::range<2> gRange{iter_nelems, update_nelems};
+    sycl::range<2> lRange{1, sg_size};
+
+    sycl::nd_range<2> ndRange{gRange, lRange};
+
+    sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependent_event);
+
+        cgh.parallel_for<UpdateKernelName>(
+            ndRange, [chunk_size, src_size, local_stride, src, local_scans,
+                      out_iter_indexer, out_indexer](sycl::nd_item<2> ndit) {
+                static constexpr ScanOpT scan_op{};
+                static constexpr outputT identity =
+                    su_ns::Identity<ScanOpT, outputT>::value;
+
+                const std::uint32_t lws = ndit.get_local_range(1);
+
+                const std::size_t iter_gid = ndit.get_group(0);
+
+                const std::size_t src_axis_id0 =
+                    ndit.get_group(1) * updates_per_wi * lws +
+                    ndit.get_local_id(1);
+                const std::size_t src_iter_id = out_iter_indexer(iter_gid);
+#pragma unroll
+                for (nwiT i = 0; i < updates_per_wi; ++i) {
+                    const std::size_t src_axis_id = src_axis_id0 + i * lws;
+                    const std::size_t src_id =
+                        out_indexer(src_axis_id) + src_iter_id;
+
+                    if (src_axis_id < src_size) {
+                        const std::size_t scan_axis_id =
+                            src_axis_id / chunk_size;
+                        const std::size_t scan_id =
+                            scan_axis_id + iter_gid * local_stride;
+
+                        const outputT modifier = (scan_axis_id > 0)
+                                                     ? local_scans[scan_id - 1]
+                                                     : identity;
+
+                        src[src_id] = scan_op(src[src_id], modifier);
+                    }
+                }
+            });
+    });
+
+    return update_event;
+}
+
+template <typename outputT, nwiT n_wi, typename ScanOpT>
+class inclusive_scan_iter_chunk_update_krn;
+
+template <typename UpdateKernelName,
+          typename outputT,
+          nwiT n_wi,
+          typename ScanOpT>
+sycl::event update_local_chunks(sycl::queue &exec_q,
+                                std::size_t iter_nelems,
+                                outputT *src,
+                                std::size_t src_size,
+                                const outputT *local_scans,
+                                std::size_t chunk_size,
+                                std::size_t local_stride,
+                                sycl::event dependent_event)
+{
+    static constexpr NoOpIndexer out_indexer{};
+    static constexpr NoOpIndexer iter_out_indexer{};
+
+    return final_update_local_chunks<UpdateKernelName, outputT, n_wi,
+                                     NoOpIndexer, NoOpIndexer, ScanOpT>(
+        exec_q, iter_nelems, src, src_size, local_scans, chunk_size,
+        local_stride, iter_out_indexer, out_indexer, dependent_event);
+}
+
+template <typename inputT,
+          typename outputT,
+          nwiT n_wi,
+          typename InpIterIndexerT,
+          typename OutIterIndexerT,
+          typename InpIndexerT,
+          typename OutIndexerT,
+          typename TransformerT,
+          typename ScanOpT,
+          bool include_initial>
+sycl::event inclusive_scan_iter(sycl::queue &exec_q,
+                                const std::uint32_t wg_size,
+                                const std::size_t iter_nelems,
+                                const std::size_t acc_nelems,
+                                const inputT *input,
+                                outputT *output,
+                                const std::size_t s0,
+                                const std::size_t s1,
+                                const InpIterIndexerT &inp_iter_indexer,
+                                const OutIterIndexerT &out_iter_indexer,
+                                const InpIndexerT &inp_indexer,
+                                const OutIndexerT &out_indexer,
+                                const TransformerT &transformer,
+                                std::vector<sycl::event> &host_tasks,
+                                const std::vector<sycl::event> &depends = {})
+{
+    static constexpr ScanOpT scan_op{};
+    static constexpr outputT identity =
+        su_ns::Identity<ScanOpT, outputT>::value;
+
+    using IterIndexerT =
+        dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+            InpIterIndexerT, OutIterIndexerT>;
+    const IterIndexerT iter_indexer{inp_iter_indexer, out_iter_indexer};
+
+    std::size_t acc_groups;
+    sycl::event inc_scan_phase1_ev =
+        inclusive_scan_base_step<inputT, outputT, n_wi, IterIndexerT,
+                                 InpIndexerT, OutIndexerT, TransformerT,
+                                 ScanOpT, include_initial>(
+            exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1,
+            iter_indexer, inp_indexer, out_indexer, transformer, scan_op,
+            identity, acc_groups, depends);
+
+    sycl::event dependent_event = inc_scan_phase1_ev;
+    if (acc_groups > 1) {
+        const std::size_t chunk_size = wg_size * n_wi;
+
+        // how much of temporary allocation do we need
+        std::size_t acc_groups_ = acc_groups;
+        std::size_t temp_size = 0;
+        while (acc_groups_ > 1) {
+            const std::size_t this_size = (acc_groups_ - 1);
+            temp_size += this_size;
+            acc_groups_ = ceiling_quotient<std::size_t>(this_size, chunk_size);
+        }
+
+        // allocate
+        auto temp_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<outputT>(
+                iter_nelems * temp_size, exec_q);
+        outputT *temp = temp_owner.get();
+
+        std::vector<detail::stack_strided_t<outputT>> stack{};
+
+        // inclusive scans over blocks
+        acc_groups_ = acc_groups;
+        outputT *src = output;
+        outputT *local_scans = temp;
+
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        static constexpr NoOpIndexerT _no_op_indexer{};
+        using NoOpTransformerT = NoOpTransformer<outputT>;
+        static constexpr NoOpTransformerT _no_op_transformer{};
+        std::size_t size_to_update = acc_nelems;
+
+        {
+            std::size_t src_size = acc_groups - 1;
+            using LocalScanIndexerT =
+                dpctl::tensor::offset_utils::Strided1DIndexer;
+            const LocalScanIndexerT scan_iter_indexer{/* size */ iter_nelems,
+                                                      /* step */ src_size};
+
+            using IterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    OutIterIndexerT, LocalScanIndexerT>;
+            const IterIndexerT iter_indexer_{out_iter_indexer,
+                                             scan_iter_indexer};
+
+            dependent_event =
+                inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
+                                         OutIndexerT, NoOpIndexerT,
+                                         NoOpTransformerT, ScanOpT>(
+                    exec_q, wg_size, iter_nelems, src_size, src, local_scans,
+                    chunk_size - 1, chunk_size, iter_indexer_, out_indexer,
+                    _no_op_indexer, _no_op_transformer, scan_op, identity,
+                    acc_groups_, // acc_groups_ is modified in place
+                    {dependent_event});
+            stack.push_back({src, size_to_update, local_scans, src_size});
+            src = local_scans;
+            local_scans += src_size * iter_nelems;
+            size_to_update = src_size;
+        }
+
+        while (acc_groups_ > 1) {
+            std::size_t src_size = acc_groups_ - 1;
+
+            using LocalScanIndexerT =
+                dpctl::tensor::offset_utils::Strided1DIndexer;
+            const LocalScanIndexerT scan1_iter_indexer{
+                /* size */ iter_nelems,
+                /* step */ size_to_update};
+            const LocalScanIndexerT scan2_iter_indexer{/* size */ iter_nelems,
+                                                       /* step */ src_size};
+
+            using IterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    LocalScanIndexerT, LocalScanIndexerT>;
+            const IterIndexerT iter_indexer_{scan1_iter_indexer,
+                                             scan2_iter_indexer};
+
+            dependent_event =
+                inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
+                                         NoOpIndexerT, NoOpIndexerT,
+                                         NoOpTransformerT, ScanOpT>(
+                    exec_q, wg_size, iter_nelems, src_size, src, local_scans,
+                    chunk_size - 1, chunk_size, iter_indexer_, _no_op_indexer,
+                    _no_op_indexer, _no_op_transformer, scan_op, identity,
+                    acc_groups_, // acc_groups_ is modified in place
+                    {dependent_event});
+            stack.push_back({src, size_to_update, local_scans, src_size});
+            src = local_scans;
+            local_scans += src_size * iter_nelems;
+            size_to_update = src_size;
+        }
+
+        for (std::size_t reverse_stack_id = 0;
+             reverse_stack_id < stack.size() - 1; ++reverse_stack_id)
+        {
+            const std::size_t stack_id = stack.size() - 1 - reverse_stack_id;
+
+            const auto &stack_elem = stack[stack_id];
+            outputT *src = stack_elem.get_src_ptr();
+            std::size_t src_size = stack_elem.get_size();
+            outputT *local_scans = stack_elem.get_local_scans_ptr();
+            std::size_t local_stride = stack_elem.get_local_stride();
+
+            using UpdateKernelName =
+                class inclusive_scan_iter_chunk_update_krn<outputT, n_wi,
+                                                           ScanOpT>;
+
+            dependent_event =
+                update_local_chunks<UpdateKernelName, outputT, n_wi, ScanOpT>(
+                    exec_q, iter_nelems, src, src_size, local_scans, chunk_size,
+                    local_stride, dependent_event);
+        }
+
+        // last stack element is always directly to output
+        {
+            const auto &stack_elem = stack[0];
+            outputT *src = stack_elem.get_src_ptr();
+            const std::size_t src_size = stack_elem.get_size();
+            outputT *local_scans = stack_elem.get_local_scans_ptr();
+            const std::size_t local_stride = stack_elem.get_local_stride();
+
+            using UpdateKernelName =
+                class inclusive_scan_final_chunk_update_krn<
+                    outputT, n_wi, OutIterIndexerT, OutIndexerT, ScanOpT>;
+
+            dependent_event =
+                final_update_local_chunks<UpdateKernelName, outputT, n_wi,
+                                          OutIterIndexerT, OutIndexerT,
+                                          ScanOpT>(
+                    exec_q, iter_nelems, src, src_size, local_scans, chunk_size,
+                    local_stride, out_iter_indexer, out_indexer,
+                    dependent_event);
+        }
+
+        sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {dependent_event}, temp_owner);
+        host_tasks.push_back(free_ev);
+    }
+
+    return dependent_event;
+}
+
+typedef sycl::event (*accumulate_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    int,
+    const ssize_t *,
+    ssize_t,
+    ssize_t,
+    int,
+    const ssize_t *,
+    char *,
+    std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+template <typename srcT,
+          typename dstT,
+          typename transformerT,
+          typename AccumulateOpT,
+          bool include_initial>
+sycl::event
+    accumulate_strided_impl(sycl::queue &q,
+                            std::size_t iter_nelems,
+                            std::size_t acc_nelems,
+                            const char *src,
+                            int iter_nd,
+                            const ssize_t *iter_shape_strides,
+                            ssize_t inp_iter_offset,
+                            ssize_t out_iter_offset,
+                            int acc_nd,
+                            const ssize_t *acc_shape_strides,
+                            char *dst,
+                            std::vector<sycl::event> &host_tasks,
+                            const std::vector<sycl::event> &depends = {})
+{
+    const srcT *src_data_ptr = reinterpret_cast<const srcT *>(src);
+    dstT *dst_data_ptr = reinterpret_cast<dstT *>(dst);
+
+    using InpIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    const InpIndexerT inp_axis_indexer{acc_nd, 0, acc_shape_strides};
+    const InpIndexerT inp_iter_indexer{iter_nd, inp_iter_offset,
+                                       iter_shape_strides};
+
+    using OutIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+    const OutIndexerT out_axis_indexer{acc_nd, 0, acc_shape_strides,
+                                       acc_shape_strides + 2 * acc_nd};
+    const OutIndexerT out_iter_indexer{iter_nd, out_iter_offset,
+                                       iter_shape_strides,
+                                       iter_shape_strides + 2 * iter_nd};
+
+    static constexpr transformerT transformer{};
+
+    static constexpr std::size_t s0 = 0;
+    static constexpr std::size_t s1 = 1;
+
+    const sycl::device &dev = q.get_device();
+    sycl::event comp_ev;
+    if (dev.has(sycl::aspect::cpu)) {
+        static constexpr nwiT n_wi_for_cpu = 8;
+        const std::uint32_t wg_size = 256;
+        comp_ev =
+            inclusive_scan_iter<srcT, dstT, n_wi_for_cpu, InpIndexerT,
+                                OutIndexerT, InpIndexerT, OutIndexerT,
+                                transformerT, AccumulateOpT, include_initial>(
+                q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr,
+                s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer,
+                out_axis_indexer, transformer, host_tasks, depends);
+    }
+    else {
+        static constexpr nwiT n_wi_for_gpu = 4;
+        // base_scan_striped algorithm does not execute correctly
+        // on HIP device with wg_size > 64
+        const std::uint32_t wg_size =
+            (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256;
+        comp_ev =
+            inclusive_scan_iter<srcT, dstT, n_wi_for_gpu, InpIndexerT,
+                                OutIndexerT, InpIndexerT, OutIndexerT,
+                                transformerT, AccumulateOpT, include_initial>(
+                q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr,
+                s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer,
+                out_axis_indexer, transformer, host_tasks, depends);
+    }
+
+    return comp_ev;
+}
+
+typedef std::size_t (*cumsum_val_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    char *,
+    std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+template <typename maskT, typename cumsumT, typename transformerT>
+std::size_t cumsum_val_contig_impl(sycl::queue &q,
+                                   std::size_t n_elems,
+                                   const char *mask,
+                                   char *cumsum,
+                                   std::vector<sycl::event> &host_tasks,
+                                   const std::vector<sycl::event> &depends = {})
+{
+    const maskT *mask_data_ptr = reinterpret_cast<const maskT *>(mask);
+    cumsumT *cumsum_data_ptr = reinterpret_cast<cumsumT *>(cumsum);
+
+    using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+    static constexpr NoOpIndexerT flat_indexer{};
+    static constexpr transformerT transformer{};
+
+    static constexpr std::size_t s0 = 0;
+    static constexpr std::size_t s1 = 1;
+    static constexpr bool include_initial = false;
+    using AccumulateOpT = sycl::plus<cumsumT>;
+
+    sycl::event comp_ev;
+    const sycl::device &dev = q.get_device();
+    if (dev.has(sycl::aspect::cpu)) {
+        static constexpr nwiT n_wi_for_cpu = 8;
+        const std::uint32_t wg_size = 256;
+        comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
+                                         NoOpIndexerT, transformerT,
+                                         AccumulateOpT, include_initial>(
+            q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1,
+            flat_indexer, transformer, host_tasks, depends);
+    }
+    else {
+        static constexpr nwiT n_wi_for_gpu = 4;
+        // base_scan_striped algorithm does not execute correctly
+        // on HIP device with wg_size > 64
+        const std::uint32_t wg_size =
+            (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256;
+        comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
+                                         NoOpIndexerT, transformerT,
+                                         AccumulateOpT, include_initial>(
+            q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1,
+            flat_indexer, transformer, host_tasks, depends);
+    }
+    cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1);
+
+    auto host_usm_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_host<cumsumT>(1, q);
+    cumsumT *last_elem_host_usm = host_usm_owner.get();
+
+    sycl::event copy_e = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(comp_ev);
+        cgh.copy<cumsumT>(last_elem, last_elem_host_usm, 1);
+    });
+    copy_e.wait();
+    std::size_t return_val = static_cast<std::size_t>(*last_elem_host_usm);
+
+    // explicitly free USM host allocation, by invoking deleter of
+    // the unique_ptr
+    host_usm_owner.reset(nullptr);
+
+    return return_val;
+}
+
+template <typename fnT, typename T>
+struct MaskPositionsContigFactoryForInt32
+{
+    fnT get()
+    {
+        using cumsumT = std::int32_t;
+        fnT fn =
+            cumsum_val_contig_impl<T, cumsumT, NonZeroIndicator<T, cumsumT>>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaskPositionsContigFactoryForInt64
+{
+    fnT get()
+    {
+        using cumsumT = std::int64_t;
+        fnT fn =
+            cumsum_val_contig_impl<T, cumsumT, NonZeroIndicator<T, cumsumT>>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct Cumsum1DContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_integral_v<T>) {
+            using cumsumT = std::int64_t;
+            fnT fn =
+                cumsum_val_contig_impl<T, cumsumT, NoOpTransformer<cumsumT>>;
+            return fn;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+typedef std::size_t (*cumsum_val_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    int,
+    const ssize_t *,
+    char *,
+    std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+template <typename maskT, typename cumsumT, typename transformerT>
+std::size_t
+    cumsum_val_strided_impl(sycl::queue &q,
+                            std::size_t n_elems,
+                            const char *mask,
+                            int nd,
+                            const ssize_t *shape_strides,
+                            char *cumsum,
+                            std::vector<sycl::event> &host_tasks,
+                            const std::vector<sycl::event> &depends = {})
+{
+    const maskT *mask_data_ptr = reinterpret_cast<const maskT *>(mask);
+    cumsumT *cumsum_data_ptr = reinterpret_cast<cumsumT *>(cumsum);
+
+    using StridedIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    const StridedIndexerT strided_indexer{nd, 0, shape_strides};
+    static constexpr transformerT transformer{};
+
+    static constexpr std::size_t s0 = 0;
+    static constexpr std::size_t s1 = 1;
+    static constexpr bool include_initial = false;
+    using AccumulateOpT = sycl::plus<cumsumT>;
+
+    const sycl::device &dev = q.get_device();
+    sycl::event comp_ev;
+    if (dev.has(sycl::aspect::cpu)) {
+        static constexpr nwiT n_wi_for_cpu = 8;
+        const std::uint32_t wg_size = 256;
+        comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
+                                         StridedIndexerT, transformerT,
+                                         AccumulateOpT, include_initial>(
+            q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1,
+            strided_indexer, transformer, host_tasks, depends);
+    }
+    else {
+        static constexpr nwiT n_wi_for_gpu = 4;
+        // base_scan_striped algorithm does not execute correctly
+        // on HIP device with wg_size > 64
+        const std::uint32_t wg_size =
+            (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256;
+        comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
+                                         StridedIndexerT, transformerT,
+                                         AccumulateOpT, include_initial>(
+            q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1,
+            strided_indexer, transformer, host_tasks, depends);
+    }
+
+    cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1);
+
+    auto host_usm_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_host<cumsumT>(1, q);
+    cumsumT *last_elem_host_usm = host_usm_owner.get();
+
+    sycl::event copy_e = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(comp_ev);
+        cgh.copy<cumsumT>(last_elem, last_elem_host_usm, 1);
+    });
+    copy_e.wait();
+    std::size_t return_val = static_cast<std::size_t>(*last_elem_host_usm);
+
+    // explicitly free USM-host temporary, by invoking deleter of
+    // the unique_ptr
+    host_usm_owner.reset(nullptr);
+
+    return return_val;
+}
+
+template <typename fnT, typename T>
+struct MaskPositionsStridedFactoryForInt32
+{
+    fnT get()
+    {
+        using cumsumT = std::int32_t;
+        fnT fn =
+            cumsum_val_strided_impl<T, cumsumT, NonZeroIndicator<T, cumsumT>>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaskPositionsStridedFactoryForInt64
+{
+    fnT get()
+    {
+        using cumsumT = std::int64_t;
+        fnT fn =
+            cumsum_val_strided_impl<T, cumsumT, NonZeroIndicator<T, cumsumT>>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct Cumsum1DStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_integral_v<T>) {
+            using cumsumT = std::int64_t;
+            fnT fn =
+                cumsum_val_strided_impl<T, cumsumT, NoOpTransformer<cumsumT>>;
+            return fn;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::accumulators
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
new file mode 100644
index 000000000000..046ad87d7d78
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
@@ -0,0 +1,853 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for advanced tensor index operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::kernels::indexing
+{
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::offset_utils;
+
+template <typename OrthogIndexerT,
+          typename MaskedSrcIndexerT,
+          typename MaskedDstIndexerT,
+          typename dataT,
+          typename indT,
+          typename LocalAccessorT>
+struct MaskedExtractStridedFunctor
+{
+    MaskedExtractStridedFunctor(const dataT *src_data_p,
+                                const indT *cumsum_data_p,
+                                dataT *dst_data_p,
+                                std::size_t masked_iter_size,
+                                const OrthogIndexerT &orthog_src_dst_indexer_,
+                                const MaskedSrcIndexerT &masked_src_indexer_,
+                                const MaskedDstIndexerT &masked_dst_indexer_,
+                                const LocalAccessorT &lacc_)
+        : src(src_data_p), cumsum(cumsum_data_p), dst(dst_data_p),
+          masked_nelems(masked_iter_size),
+          orthog_src_dst_indexer(orthog_src_dst_indexer_),
+          masked_src_indexer(masked_src_indexer_),
+          masked_dst_indexer(masked_dst_indexer_), lacc(lacc_)
+    {
+        static_assert(
+            std::is_same_v<indT, typename LocalAccessorT::value_type>);
+    }
+
+    void operator()(sycl::nd_item<2> ndit) const
+    {
+        const std::size_t orthog_i = ndit.get_global_id(0);
+        const std::uint32_t l_i = ndit.get_local_id(1);
+        const std::uint32_t lws = ndit.get_local_range(1);
+
+        const std::size_t masked_i = ndit.get_global_id(1);
+        const std::size_t masked_block_start = masked_i - l_i;
+
+        const std::size_t max_offset = masked_nelems + 1;
+        for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
+            const std::size_t offset = masked_block_start + i;
+            lacc[i] = (offset == 0)           ? indT(0)
+                      : (offset < max_offset) ? cumsum[offset - 1]
+                                              : cumsum[masked_nelems - 1] + 1;
+        }
+
+        sycl::group_barrier(ndit.get_group());
+
+        const indT current_running_count = lacc[l_i + 1];
+        const bool mask_set = (masked_i == 0)
+                                  ? (current_running_count == 1)
+                                  : (current_running_count == lacc[l_i] + 1);
+
+        // dst[cumsum[i] - 1, j] = src[i, j]
+        //     if cumsum[i] == ((i > 0) ? cumsum[i-1] + 1 : 1)
+        if (mask_set && (masked_i < masked_nelems)) {
+            const auto &orthog_offsets = orthog_src_dst_indexer(orthog_i);
+
+            const std::size_t total_src_offset =
+                masked_src_indexer(masked_i) +
+                orthog_offsets.get_first_offset();
+            const std::size_t total_dst_offset =
+                masked_dst_indexer(current_running_count - 1) +
+                orthog_offsets.get_second_offset();
+
+            dst[total_dst_offset] = src[total_src_offset];
+        }
+    }
+
+private:
+    const dataT *src = nullptr;
+    const indT *cumsum = nullptr;
+    dataT *dst = nullptr;
+    std::size_t masked_nelems = 0;
+    // has nd, shape, src_strides, dst_strides for
+    // dimensions that ARE NOT masked
+    OrthogIndexerT orthog_src_dst_indexer;
+    // has nd, shape, src_strides for
+    // dimensions that ARE masked
+    MaskedSrcIndexerT masked_src_indexer;
+    // has 1, dst_strides for dimensions that ARE masked
+    MaskedDstIndexerT masked_dst_indexer;
+    LocalAccessorT lacc;
+};
+
+template <typename OrthogIndexerT,
+          typename MaskedDstIndexerT,
+          typename MaskedRhsIndexerT,
+          typename dataT,
+          typename indT,
+          typename LocalAccessorT>
+struct MaskedPlaceStridedFunctor
+{
+    MaskedPlaceStridedFunctor(dataT *dst_data_p,
+                              const indT *cumsum_data_p,
+                              const dataT *rhs_data_p,
+                              std::size_t masked_iter_size,
+                              const OrthogIndexerT &orthog_dst_rhs_indexer_,
+                              const MaskedDstIndexerT &masked_dst_indexer_,
+                              const MaskedRhsIndexerT &masked_rhs_indexer_,
+                              const LocalAccessorT &lacc_)
+        : dst(dst_data_p), cumsum(cumsum_data_p), rhs(rhs_data_p),
+          masked_nelems(masked_iter_size),
+          orthog_dst_rhs_indexer(orthog_dst_rhs_indexer_),
+          masked_dst_indexer(masked_dst_indexer_),
+          masked_rhs_indexer(masked_rhs_indexer_), lacc(lacc_)
+    {
+        static_assert(
+            std::is_same_v<indT, typename LocalAccessorT::value_type>);
+    }
+
+    void operator()(sycl::nd_item<2> ndit) const
+    {
+        const std::size_t orthog_i = ndit.get_global_id(0);
+        const std::uint32_t l_i = ndit.get_local_id(1);
+        const std::uint32_t lws = ndit.get_local_range(1);
+
+        const std::size_t masked_i = ndit.get_global_id(1);
+        const std::size_t masked_block_start = masked_i - l_i;
+
+        const std::size_t max_offset = masked_nelems + 1;
+        for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
+            const std::size_t offset = masked_block_start + i;
+            lacc[i] = (offset == 0)           ? indT(0)
+                      : (offset < max_offset) ? cumsum[offset - 1]
+                                              : cumsum[masked_nelems - 1] + 1;
+        }
+
+        sycl::group_barrier(ndit.get_group());
+
+        const indT current_running_count = lacc[l_i + 1];
+        const bool mask_set = (masked_i == 0)
+                                  ? (current_running_count == 1)
+                                  : (current_running_count == lacc[l_i] + 1);
+
+        // src[i, j] = rhs[cumsum[i] - 1, j]
+        // if cumsum[i] == ((i > 0) ? cumsum[i-1] + 1 : 1)
+        if (mask_set && (masked_i < masked_nelems)) {
+            const auto &orthog_offsets = orthog_dst_rhs_indexer(orthog_i);
+
+            const std::size_t total_dst_offset =
+                masked_dst_indexer(masked_i) +
+                orthog_offsets.get_first_offset();
+            const std::size_t total_rhs_offset =
+                masked_rhs_indexer(current_running_count - 1) +
+                orthog_offsets.get_second_offset();
+
+            dst[total_dst_offset] = rhs[total_rhs_offset];
+        }
+    }
+
+private:
+    dataT *dst = nullptr;
+    const indT *cumsum = nullptr;
+    const dataT *rhs = nullptr;
+    std::size_t masked_nelems = 0;
+    // has nd, shape, dst_strides, rhs_strides for
+    // dimensions that ARE NOT masked
+    OrthogIndexerT orthog_dst_rhs_indexer;
+    // has nd, shape, dst_strides for
+    // dimensions that ARE masked
+    MaskedDstIndexerT masked_dst_indexer;
+    // has 1, rhs_strides for dimensions that ARE masked
+    MaskedRhsIndexerT masked_rhs_indexer;
+    LocalAccessorT lacc;
+};
+
+// ======= Masked extraction ================================
+
+namespace detail
+{
+
+template <std::size_t I, std::size_t... IR>
+std::size_t _get_lws_impl(std::size_t n)
+{
+    if constexpr (sizeof...(IR) == 0) {
+        return I;
+    }
+    else {
+        return (n < I) ? _get_lws_impl<IR...>(n) : I;
+    }
+}
+
+inline std::size_t get_lws(std::size_t n)
+{
+    static constexpr std::size_t lws0 = 256u;
+    static constexpr std::size_t lws1 = 128u;
+    static constexpr std::size_t lws2 = 64u;
+    return _get_lws_impl<lws0, lws1, lws2>(n);
+}
+
+} // end of namespace detail
+
+template <typename MaskedDstIndexerT, typename dataT, typename indT>
+class masked_extract_all_slices_contig_impl_krn;
+
+typedef sycl::event (*masked_extract_all_slices_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    ssize_t,
+    const char *,
+    const char *,
+    char *,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename dataT, typename indT>
+sycl::event masked_extract_all_slices_contig_impl(
+    sycl::queue &exec_q,
+    ssize_t iteration_size,
+    const char *src_p,
+    const char *cumsum_p,
+    char *dst_p,
+    ssize_t dst_size, // dst is 1D
+    ssize_t dst_stride,
+    const std::vector<sycl::event> &depends = {})
+{
+    static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{};
+
+    static constexpr NoOpIndexer masked_src_indexer{};
+    const Strided1DIndexer masked_dst_indexer(/* size */ dst_size,
+                                              /* step */ dst_stride);
+
+    using KernelName =
+        class masked_extract_all_slices_contig_impl_krn<Strided1DIndexer, dataT,
+                                                        indT>;
+
+    using LocalAccessorT = sycl::local_accessor<indT, 1>;
+    using Impl =
+        struct MaskedExtractStridedFunctor<TwoZeroOffsets_Indexer, NoOpIndexer,
+                                           Strided1DIndexer, dataT, indT,
+                                           LocalAccessorT>;
+
+    const std::size_t masked_extent = iteration_size;
+
+    const std::size_t lws = detail::get_lws(masked_extent);
+
+    const std::size_t n_groups = (iteration_size + lws - 1) / lws;
+
+    sycl::range<2> gRange{1, n_groups * lws};
+    sycl::range<2> lRange{1, lws};
+
+    sycl::nd_range<2> ndRange(gRange, lRange);
+
+    const dataT *src_tp = reinterpret_cast<const dataT *>(src_p);
+    const indT *cumsum_tp = reinterpret_cast<const indT *>(cumsum_p);
+    dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t lacc_size = std::min(lws, masked_extent) + 1;
+        LocalAccessorT lacc(lacc_size, cgh);
+
+        cgh.parallel_for<KernelName>(
+            ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_extent,
+                          orthog_src_dst_indexer, masked_src_indexer,
+                          masked_dst_indexer, lacc));
+    });
+
+    return comp_ev;
+}
+
+template <typename MaskedSrcIndexerT,
+          typename MaskedDstIndexerT,
+          typename dataT,
+          typename indT>
+class masked_extract_all_slices_strided_impl_krn;
+
+typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    ssize_t,
+    const char *,
+    const char *,
+    char *,
+    int,
+    ssize_t const *,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename dataT, typename indT>
+sycl::event masked_extract_all_slices_strided_impl(
+    sycl::queue &exec_q,
+    ssize_t iteration_size,
+    const char *src_p,
+    const char *cumsum_p,
+    char *dst_p,
+    int nd,
+    const ssize_t
+        *packed_src_shape_strides, // [src_shape, src_strides], length 2*nd
+    ssize_t dst_size,              // dst is 1D
+    ssize_t dst_stride,
+    const std::vector<sycl::event> &depends = {})
+{
+    static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{};
+
+    /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
+     * *_packed_shape_strides) */
+    const StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides);
+    const Strided1DIndexer masked_dst_indexer(/* size */ dst_size,
+                                              /* step */ dst_stride);
+
+    using KernelName = class masked_extract_all_slices_strided_impl_krn<
+        StridedIndexer, Strided1DIndexer, dataT, indT>;
+
+    using LocalAccessorT = sycl::local_accessor<indT, 1>;
+    using Impl =
+        struct MaskedExtractStridedFunctor<TwoZeroOffsets_Indexer,
+                                           StridedIndexer, Strided1DIndexer,
+                                           dataT, indT, LocalAccessorT>;
+
+    const std::size_t masked_nelems = iteration_size;
+
+    const std::size_t lws = detail::get_lws(masked_nelems);
+
+    const std::size_t n_groups = (masked_nelems + lws - 1) / lws;
+
+    sycl::range<2> gRange{1, n_groups * lws};
+    sycl::range<2> lRange{1, lws};
+
+    sycl::nd_range<2> ndRange(gRange, lRange);
+
+    const dataT *src_tp = reinterpret_cast<const dataT *>(src_p);
+    const indT *cumsum_tp = reinterpret_cast<const indT *>(cumsum_p);
+    dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t lacc_size = std::min(lws, masked_nelems) + 1;
+        LocalAccessorT lacc(lacc_size, cgh);
+
+        cgh.parallel_for<KernelName>(
+            ndRange, Impl(src_tp, cumsum_tp, dst_tp, iteration_size,
+                          orthog_src_dst_indexer, masked_src_indexer,
+                          masked_dst_indexer, lacc));
+    });
+
+    return comp_ev;
+}
+
+typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    ssize_t,
+    ssize_t,
+    const char *,
+    const char *,
+    char *,
+    int,
+    ssize_t const *,
+    ssize_t,
+    ssize_t,
+    int,
+    ssize_t const *,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename OrthoIndexerT,
+          typename MaskedSrcIndexerT,
+          typename MaskedDstIndexerT,
+          typename dataT,
+          typename indT>
+class masked_extract_some_slices_strided_impl_krn;
+
+template <typename dataT, typename indT>
+sycl::event masked_extract_some_slices_strided_impl(
+    sycl::queue &exec_q,
+    ssize_t orthog_nelems,
+    ssize_t masked_nelems,
+    const char *src_p,
+    const char *cumsum_p,
+    char *dst_p,
+    int orthog_nd,
+    // [ortho_shape, ortho_src_strides, // ortho_dst_strides],
+    // length 3*ortho_nd
+    const ssize_t *packed_ortho_src_dst_shape_strides,
+    ssize_t ortho_src_offset,
+    ssize_t ortho_dst_offset,
+    int masked_nd,
+    // [masked_src_shape, masked_src_strides],
+    // length 2*masked_nd, mask_dst is 1D
+    const ssize_t *packed_masked_src_shape_strides,
+    ssize_t masked_dst_size,
+    ssize_t masked_dst_stride,
+    const std::vector<sycl::event> &depends = {})
+{
+    const TwoOffsets_StridedIndexer orthog_src_dst_indexer{
+        orthog_nd, ortho_src_offset, ortho_dst_offset,
+        packed_ortho_src_dst_shape_strides};
+
+    const StridedIndexer masked_src_indexer{masked_nd, 0,
+                                            packed_masked_src_shape_strides};
+    const Strided1DIndexer masked_dst_indexer{/* size */ masked_dst_size,
+                                              /* step */ masked_dst_stride};
+
+    using KernelName = class masked_extract_some_slices_strided_impl_krn<
+        TwoOffsets_StridedIndexer, StridedIndexer, Strided1DIndexer, dataT,
+        indT>;
+
+    using LocalAccessorT = sycl::local_accessor<indT, 1>;
+    using Impl =
+        struct MaskedExtractStridedFunctor<TwoOffsets_StridedIndexer,
+                                           StridedIndexer, Strided1DIndexer,
+                                           dataT, indT, LocalAccessorT>;
+
+    const std::size_t masked_extent = masked_nelems;
+
+    const std::size_t lws = detail::get_lws(masked_extent);
+
+    const std::size_t n_groups = ((masked_extent + lws - 1) / lws);
+    const std::size_t orthog_extent = static_cast<std::size_t>(orthog_nelems);
+
+    sycl::range<2> gRange{orthog_extent, n_groups * lws};
+    sycl::range<2> lRange{1, lws};
+
+    sycl::nd_range<2> ndRange(gRange, lRange);
+
+    const dataT *src_tp = reinterpret_cast<const dataT *>(src_p);
+    const indT *cumsum_tp = reinterpret_cast<const indT *>(cumsum_p);
+    dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t lacc_size =
+            std::min<std::size_t>(lws, masked_extent) + 1;
+        LocalAccessorT lacc(lacc_size, cgh);
+
+        cgh.parallel_for<KernelName>(
+            ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_nelems,
+                          orthog_src_dst_indexer, masked_src_indexer,
+                          masked_dst_indexer, lacc));
+    });
+
+    return comp_ev;
+}
+
+template <typename fnT, typename T>
+struct MaskExtractAllSlicesContigFactoryForInt32
+{
+    fnT get()
+    {
+        fnT fn = masked_extract_all_slices_contig_impl<T, std::int32_t>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaskExtractAllSlicesContigFactoryForInt64
+{
+    fnT get()
+    {
+        fnT fn = masked_extract_all_slices_contig_impl<T, std::int64_t>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaskExtractAllSlicesStridedFactoryForInt32
+{
+    fnT get()
+    {
+        fnT fn = masked_extract_all_slices_strided_impl<T, std::int32_t>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaskExtractAllSlicesStridedFactoryForInt64
+{
+    fnT get()
+    {
+        fnT fn = masked_extract_all_slices_strided_impl<T, std::int64_t>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaskExtractSomeSlicesStridedFactoryForInt32
+{
+    fnT get()
+    {
+        fnT fn = masked_extract_some_slices_strided_impl<T, std::int32_t>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaskExtractSomeSlicesStridedFactoryForInt64
+{
+    fnT get()
+    {
+        fnT fn = masked_extract_some_slices_strided_impl<T, std::int64_t>;
+        return fn;
+    }
+};
+
+// Masked placement
+
+template <typename OrthoIndexerT,
+          typename MaskedDstIndexerT,
+          typename MaskedRhsIndexerT,
+          typename dataT,
+          typename indT>
+class masked_place_all_slices_strided_impl_krn;
+
+typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    ssize_t,
+    char *,
+    const char *,
+    const char *,
+    int,
+    ssize_t const *,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename dataT, typename indT>
+sycl::event masked_place_all_slices_strided_impl(
+    sycl::queue &exec_q,
+    ssize_t iteration_size,
+    char *dst_p,
+    const char *cumsum_p,
+    const char *rhs_p,
+    int nd,
+    const ssize_t
+        *packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd
+    ssize_t rhs_size,              // rhs is 1D
+    ssize_t rhs_stride,
+    const std::vector<sycl::event> &depends = {})
+{
+    static constexpr TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{};
+
+    /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
+     * *_packed_shape_strides) */
+    const StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides);
+    const Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride);
+
+    using KernelName = class masked_place_all_slices_strided_impl_krn<
+        TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, dataT,
+        indT>;
+
+    static constexpr std::size_t nominal_lws = 256;
+    const std::size_t masked_extent = iteration_size;
+    const std::size_t lws = std::min(masked_extent, nominal_lws);
+
+    const std::size_t n_groups = (masked_extent + lws - 1) / lws;
+
+    sycl::range<2> gRange{1, n_groups * lws};
+    sycl::range<2> lRange{1, lws};
+    sycl::nd_range<2> ndRange{gRange, lRange};
+
+    using LocalAccessorT = sycl::local_accessor<indT, 1>;
+    using Impl =
+        MaskedPlaceStridedFunctor<TwoZeroOffsets_Indexer, StridedIndexer,
+                                  Strided1DCyclicIndexer, dataT, indT,
+                                  LocalAccessorT>;
+
+    dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
+    const dataT *rhs_tp = reinterpret_cast<const dataT *>(rhs_p);
+    const indT *cumsum_tp = reinterpret_cast<const indT *>(cumsum_p);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t lacc_size = std::min(masked_extent, lws) + 1;
+        LocalAccessorT lacc(lacc_size, cgh);
+
+        cgh.parallel_for<KernelName>(
+            ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, iteration_size,
+                          orthog_dst_rhs_indexer, masked_dst_indexer,
+                          masked_rhs_indexer, lacc));
+    });
+
+    return comp_ev;
+}
+
+typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    ssize_t,
+    ssize_t,
+    char *,
+    const char *,
+    const char *,
+    int,
+    ssize_t const *,
+    ssize_t,
+    ssize_t,
+    int,
+    ssize_t const *,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename OrthoIndexerT,
+          typename MaskedSrcIndexerT,
+          typename MaskedDstIndexerT,
+          typename dataT,
+          typename indT>
+class masked_place_some_slices_strided_impl_krn;
+
+template <typename dataT, typename indT>
+sycl::event masked_place_some_slices_strided_impl(
+    sycl::queue &exec_q,
+    ssize_t orthog_nelems,
+    ssize_t masked_nelems,
+    char *dst_p,
+    const char *cumsum_p,
+    const char *rhs_p,
+    int orthog_nd,
+    // [ortho_shape, ortho_dst_strides, ortho_rhs_strides],
+    // length 3*ortho_nd
+    const ssize_t *packed_ortho_dst_rhs_shape_strides,
+    ssize_t ortho_dst_offset,
+    ssize_t ortho_rhs_offset,
+    int masked_nd,
+    // [masked_dst_shape, masked_dst_strides],
+    // length 2*masked_nd, mask_dst is 1D
+    const ssize_t *packed_masked_dst_shape_strides,
+    ssize_t masked_rhs_size,
+    ssize_t masked_rhs_stride,
+    const std::vector<sycl::event> &depends = {})
+{
+    const TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{
+        orthog_nd, ortho_dst_offset, ortho_rhs_offset,
+        packed_ortho_dst_rhs_shape_strides};
+
+    /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
+     * *_packed_shape_strides) */
+    const StridedIndexer masked_dst_indexer{masked_nd, 0,
+                                            packed_masked_dst_shape_strides};
+    const Strided1DCyclicIndexer masked_rhs_indexer{0, masked_rhs_size,
+                                                    masked_rhs_stride};
+
+    using KernelName = class masked_place_some_slices_strided_impl_krn<
+        TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer,
+        dataT, indT>;
+
+    static constexpr std::size_t nominal_lws = 256;
+    const std::size_t orthog_extent = orthog_nelems;
+    const std::size_t masked_extent = masked_nelems;
+    const std::size_t lws = std::min(masked_extent, nominal_lws);
+
+    const std::size_t n_groups = (masked_extent + lws - 1) / lws;
+
+    sycl::range<2> gRange{orthog_extent, n_groups * lws};
+    sycl::range<2> lRange{1, lws};
+    sycl::nd_range<2> ndRange{gRange, lRange};
+
+    using LocalAccessorT = sycl::local_accessor<indT, 1>;
+    using Impl =
+        MaskedPlaceStridedFunctor<TwoOffsets_StridedIndexer, StridedIndexer,
+                                  Strided1DCyclicIndexer, dataT, indT,
+                                  LocalAccessorT>;
+
+    dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
+    const dataT *rhs_tp = reinterpret_cast<const dataT *>(rhs_p);
+    const indT *cumsum_tp = reinterpret_cast<const indT *>(cumsum_p);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t lacc_size = std::min(masked_extent, lws) + 1;
+        LocalAccessorT lacc(lacc_size, cgh);
+
+        cgh.parallel_for<KernelName>(
+            ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, masked_nelems,
+                          orthog_dst_rhs_indexer, masked_dst_indexer,
+                          masked_rhs_indexer, lacc));
+    });
+
+    return comp_ev;
+}
+
+template <typename fnT, typename T>
+struct MaskPlaceAllSlicesStridedFactoryForInt32
+{
+    fnT get()
+    {
+        fnT fn = masked_place_all_slices_strided_impl<T, std::int32_t>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaskPlaceAllSlicesStridedFactoryForInt64
+{
+    fnT get()
+    {
+        fnT fn = masked_place_all_slices_strided_impl<T, std::int64_t>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaskPlaceSomeSlicesStridedFactoryForInt32
+{
+    fnT get()
+    {
+        fnT fn = masked_place_some_slices_strided_impl<T, std::int32_t>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaskPlaceSomeSlicesStridedFactoryForInt64
+{
+    fnT get()
+    {
+        fnT fn = masked_place_some_slices_strided_impl<T, std::int64_t>;
+        return fn;
+    }
+};
+
+// Non-zero
+
+template <typename T1, typename T2>
+class non_zero_indexes_krn;
+
+typedef sycl::event (*non_zero_indexes_fn_ptr_t)(
+    sycl::queue &,
+    ssize_t,
+    ssize_t,
+    int,
+    const char *,
+    char *,
+    const ssize_t *,
+    std::vector<sycl::event> const &);
+
+template <typename indT1, typename indT2>
+sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
+                                  ssize_t iter_size,
+                                  ssize_t nz_elems,
+                                  int nd,
+                                  const char *cumsum_cp,
+                                  char *indexes_cp,
+                                  const ssize_t *mask_shape,
+                                  std::vector<sycl::event> const &depends)
+{
+    const indT1 *cumsum_data = reinterpret_cast<const indT1 *>(cumsum_cp);
+    indT2 *indexes_data = reinterpret_cast<indT2 *>(indexes_cp);
+
+    static constexpr std::size_t nominal_lws = 256u;
+    const std::size_t masked_extent = iter_size;
+    const std::size_t lws = std::min(masked_extent, nominal_lws);
+
+    const std::size_t n_groups = (masked_extent + lws - 1) / lws;
+    sycl::range<1> gRange{n_groups * lws};
+    sycl::range<1> lRange{lws};
+
+    sycl::nd_range<1> ndRange{gRange, lRange};
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t lacc_size = std::min(lws, masked_extent) + 1;
+        sycl::local_accessor<indT1, 1> lacc(lacc_size, cgh);
+
+        using KernelName = class non_zero_indexes_krn<indT1, indT2>;
+
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> ndit) {
+            const std::size_t group_i = ndit.get_group(0);
+            const std::uint32_t l_i = ndit.get_local_id(0);
+            const std::uint32_t lws = ndit.get_local_range(0);
+
+            const std::size_t masked_block_start = group_i * lws;
+
+            for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
+                const std::size_t offset = masked_block_start + i;
+                lacc[i] = (offset == 0) ? indT1(0)
+                          : (offset - 1 < masked_extent)
+                              ? cumsum_data[offset - 1]
+                              : cumsum_data[masked_extent - 1] + 1;
+            }
+
+            sycl::group_barrier(ndit.get_group());
+
+            const std::size_t i = masked_block_start + l_i;
+            const auto cs_val = lacc[l_i];
+            const bool cond = (lacc[l_i + 1] == cs_val + 1);
+
+            if (cond && (i < masked_extent)) {
+                ssize_t i_ = static_cast<ssize_t>(i);
+                for (int dim = nd; --dim > 0;) {
+                    const auto sd = mask_shape[dim];
+                    const ssize_t q = i_ / sd;
+                    const ssize_t r = (i_ - q * sd);
+                    indexes_data[cs_val + dim * nz_elems] =
+                        static_cast<indT2>(r);
+                    i_ = q;
+                }
+                indexes_data[cs_val] = static_cast<indT2>(i_);
+            }
+        });
+    });
+
+    return comp_ev;
+}
+
+} // namespace dpctl::tensor::kernels::indexing
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
index 22189ee3129c..26ae46707a6b 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
@@ -56,7 +56,8 @@ using dpctl::tensor::ssize_t;
 
 template <typename Ty>
 class full_strided_kernel;
-// template <typename Ty> class eye_kernel;
+template <typename Ty>
+class eye_kernel;
 
 using namespace dpctl::tensor::offset_utils;
 
@@ -162,6 +163,99 @@ sycl::event full_strided_impl(sycl::queue &q,
     return fill_ev;
 }
 
+/* ================ Eye ================== */
+
+typedef sycl::event (*eye_fn_ptr_t)(sycl::queue &,
+                                    std::size_t nelems, // num_elements
+                                    ssize_t start,
+                                    ssize_t end,
+                                    ssize_t step,
+                                    char *, // dst_data_ptr
+                                    const std::vector<sycl::event> &);
+
+template <typename Ty>
+class EyeFunctor
+{
+private:
+    Ty *p = nullptr;
+    ssize_t start_v;
+    ssize_t end_v;
+    ssize_t step_v;
+
+public:
+    EyeFunctor(char *dst_p,
+               const ssize_t v0,
+               const ssize_t v1,
+               const ssize_t dv)
+        : p(reinterpret_cast<Ty *>(dst_p)), start_v(v0), end_v(v1), step_v(dv)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        Ty set_v = 0;
+        ssize_t i = static_cast<ssize_t>(wiid.get(0));
+        if (i >= start_v and i <= end_v) {
+            if ((i - start_v) % step_v == 0) {
+                set_v = 1;
+            }
+        }
+        p[i] = set_v;
+    }
+};
+
+/*!
+ * @brief Function to populate 2D array with eye matrix.
+ *
+ * @param exec_q  Sycl queue to which kernel is submitted for execution.
+ * @param nelems  Number of elements to assign.
+ * @param start   Position of the first non-zero value.
+ * @param end     Position of the last non-zero value.
+ * @param step    Number of array elements between non-zeros.
+ * @param array_data Kernel accessible USM pointer for the destination array.
+ * @param depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return  Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename Ty>
+sycl::event eye_impl(sycl::queue &exec_q,
+                     std::size_t nelems,
+                     const ssize_t start,
+                     const ssize_t end,
+                     const ssize_t step,
+                     char *array_data,
+                     const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(exec_q);
+    sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using KernelName = eye_kernel<Ty>;
+        using Impl = EyeFunctor<Ty>;
+
+        cgh.parallel_for<KernelName>(sycl::range<1>{nelems},
+                                     Impl(array_data, start, end, step));
+    });
+
+    return eye_event;
+}
+
+/*!
+ * @brief  Factory to get function pointer of type `fnT` for data type `Ty`.
+ * @ingroup CtorKernels
+ */
+template <typename fnT, typename Ty>
+struct EyeFactory
+{
+    fnT get()
+    {
+        fnT f = eye_impl<Ty>;
+        return f;
+    }
+};
+
 /* =========================== Tril and triu ============================== */
 
 // define function type
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators.cpp b/dpctl_ext/tensor/libtensor/source/accumulators.cpp
new file mode 100644
index 000000000000..82913010755a
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/accumulators.cpp
@@ -0,0 +1,406 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/accumulators.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+// Computation of positions of masked elements
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t;
+static cumsum_val_contig_impl_fn_ptr_t
+    mask_positions_contig_i64_dispatch_vector[td_ns::num_types];
+static cumsum_val_contig_impl_fn_ptr_t
+    mask_positions_contig_i32_dispatch_vector[td_ns::num_types];
+
+using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t;
+static cumsum_val_strided_impl_fn_ptr_t
+    mask_positions_strided_i64_dispatch_vector[td_ns::num_types];
+static cumsum_val_strided_impl_fn_ptr_t
+    mask_positions_strided_i32_dispatch_vector[td_ns::num_types];
+
+void populate_mask_positions_dispatch_vectors(void)
+{
+    using dpctl::tensor::kernels::accumulators::
+        MaskPositionsContigFactoryForInt64;
+    td_ns::DispatchVectorBuilder<cumsum_val_contig_impl_fn_ptr_t,
+                                 MaskPositionsContigFactoryForInt64,
+                                 td_ns::num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(mask_positions_contig_i64_dispatch_vector);
+
+    using dpctl::tensor::kernels::accumulators::
+        MaskPositionsContigFactoryForInt32;
+    td_ns::DispatchVectorBuilder<cumsum_val_contig_impl_fn_ptr_t,
+                                 MaskPositionsContigFactoryForInt32,
+                                 td_ns::num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(mask_positions_contig_i32_dispatch_vector);
+
+    using dpctl::tensor::kernels::accumulators::
+        MaskPositionsStridedFactoryForInt64;
+    td_ns::DispatchVectorBuilder<cumsum_val_strided_impl_fn_ptr_t,
+                                 MaskPositionsStridedFactoryForInt64,
+                                 td_ns::num_types>
+        dvb3;
+    dvb3.populate_dispatch_vector(mask_positions_strided_i64_dispatch_vector);
+
+    using dpctl::tensor::kernels::accumulators::
+        MaskPositionsStridedFactoryForInt32;
+    td_ns::DispatchVectorBuilder<cumsum_val_strided_impl_fn_ptr_t,
+                                 MaskPositionsStridedFactoryForInt32,
+                                 td_ns::num_types>
+        dvb4;
+    dvb4.populate_dispatch_vector(mask_positions_strided_i32_dispatch_vector);
+
+    return;
+}
+
+std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
+                              const dpctl::tensor::usm_ndarray &cumsum,
+                              sycl::queue &exec_q,
+                              const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum);
+
+    // cumsum is 1D
+    if (cumsum.get_ndim() != 1) {
+        throw py::value_error("Result array must be one-dimensional.");
+    }
+
+    if (!cumsum.is_c_contiguous()) {
+        throw py::value_error("Expecting `cumsum` array must be C-contiguous.");
+    }
+
+    // cumsum.shape == (mask.size,)
+    auto mask_size = mask.get_size();
+    auto cumsum_size = cumsum.get_shape(0);
+    if (cumsum_size != mask_size) {
+        throw py::value_error("Inconsistent dimensions");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) {
+        // FIXME: use ExecutionPlacementError
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    if (mask_size == 0) {
+        return 0;
+    }
+
+    int mask_typenum = mask.get_typenum();
+    int cumsum_typenum = cumsum.get_typenum();
+
+    // mask can be any type
+    const char *mask_data = mask.get_data();
+    char *cumsum_data = cumsum.get_data();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+
+    int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum);
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    // cumsum must be int32_t/int64_t only
+    static constexpr int int32_typeid =
+        static_cast<int>(td_ns::typenum_t::INT32);
+    static constexpr int int64_typeid =
+        static_cast<int>(td_ns::typenum_t::INT64);
+    if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
+        throw py::value_error(
+            "Cumulative sum array must have int32 or int64 data-type.");
+    }
+
+    const bool use_i32 = (cumsum_typeid == int32_typeid);
+
+    std::vector<sycl::event> host_task_events;
+
+    if (mask.is_c_contiguous()) {
+        auto fn = (use_i32)
+                      ? mask_positions_contig_i32_dispatch_vector[mask_typeid]
+                      : mask_positions_contig_i64_dispatch_vector[mask_typeid];
+
+        std::size_t total_set;
+
+        {
+            py::gil_scoped_release release;
+
+            total_set = fn(exec_q, mask_size, mask_data, cumsum_data,
+                           host_task_events, depends);
+
+            sycl::event::wait(host_task_events);
+        }
+        return total_set;
+    }
+
+    const py::ssize_t *shape = mask.get_shape_raw();
+    auto const &strides_vector = mask.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT compact_shape;
+    shT compact_strides;
+
+    int mask_nd = mask.get_ndim();
+    int nd = mask_nd;
+
+    dpctl::tensor::py_internal::compact_iteration_space(
+        nd, shape, strides_vector, compact_shape, compact_strides);
+
+    // Strided implementation
+    auto strided_fn =
+        (use_i32) ? mask_positions_strided_i32_dispatch_vector[mask_typeid]
+                  : mask_positions_strided_i64_dispatch_vector[mask_typeid];
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, compact_shape, compact_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    if (2 * static_cast<std::size_t>(nd) != std::get<1>(ptr_size_event_tuple)) {
+        {
+            py::gil_scoped_release release;
+
+            copy_shape_ev.wait();
+            sycl::event::wait(host_task_events);
+
+            // ensure deleter of smart pointer is invoked with GIL released
+            shape_strides_owner.reset(nullptr);
+        }
+        throw std::runtime_error("Unexpected error");
+    }
+
+    std::vector<sycl::event> dependent_events;
+    dependent_events.reserve(depends.size() + 1);
+    dependent_events.insert(dependent_events.end(), copy_shape_ev);
+    dependent_events.insert(dependent_events.end(), depends.begin(),
+                            depends.end());
+
+    std::size_t total_set;
+
+    {
+        py::gil_scoped_release release;
+
+        total_set = strided_fn(exec_q, mask_size, mask_data, nd, shape_strides,
+                               cumsum_data, host_task_events, dependent_events);
+
+        sycl::event::wait(host_task_events);
+        // ensure deleter of smart pointer is invoked with GIL released
+        shape_strides_owner.reset(nullptr);
+    }
+
+    return total_set;
+}
+
+using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t;
+static cumsum_val_strided_impl_fn_ptr_t
+    cumsum_1d_strided_dispatch_vector[td_ns::num_types];
+using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t;
+static cumsum_val_contig_impl_fn_ptr_t
+    cumsum_1d_contig_dispatch_vector[td_ns::num_types];
+
+void populate_cumsum_1d_dispatch_vectors(void)
+{
+    using dpctl::tensor::kernels::accumulators::Cumsum1DContigFactory;
+    td_ns::DispatchVectorBuilder<cumsum_val_contig_impl_fn_ptr_t,
+                                 Cumsum1DContigFactory, td_ns::num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(cumsum_1d_contig_dispatch_vector);
+
+    using dpctl::tensor::kernels::accumulators::Cumsum1DStridedFactory;
+    td_ns::DispatchVectorBuilder<cumsum_val_strided_impl_fn_ptr_t,
+                                 Cumsum1DStridedFactory, td_ns::num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(cumsum_1d_strided_dispatch_vector);
+
+    return;
+}
+
+std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
+                         const dpctl::tensor::usm_ndarray &cumsum,
+                         sycl::queue &exec_q,
+                         std::vector<sycl::event> const &depends)
+{
+    // cumsum is 1D
+    if (cumsum.get_ndim() != 1) {
+        throw py::value_error("cumsum array must be one-dimensional.");
+    }
+
+    if (!cumsum.is_c_contiguous()) {
+        throw py::value_error("Expecting `cumsum` array to be C-contiguous.");
+    }
+
+    // cumsum.shape == (src.size,)
+    auto src_size = src.get_size();
+    auto cumsum_size = cumsum.get_shape(0);
+    if (cumsum_size != src_size) {
+        throw py::value_error("Inconsistent dimensions");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum})) {
+        // FIXME: use ExecutionPlacementError
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum);
+
+    if (src_size == 0) {
+        return 0;
+    }
+
+    int src_typenum = src.get_typenum();
+    int cumsum_typenum = cumsum.get_typenum();
+
+    // src can be any type
+    const char *src_data = src.get_data();
+    char *cumsum_data = cumsum.get_data();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    // this cumsum must be int64_t only
+    static constexpr int int64_typeid =
+        static_cast<int>(td_ns::typenum_t::INT64);
+    if (cumsum_typeid != int64_typeid) {
+        throw py::value_error(
+            "Cumulative sum array must have int64 data-type.");
+    }
+
+    std::vector<sycl::event> host_task_events;
+
+    if (src.is_c_contiguous()) {
+        auto fn = cumsum_1d_contig_dispatch_vector[src_typeid];
+        if (fn == nullptr) {
+            throw std::runtime_error(
+                "this cumsum requires integer type, got src_typeid=" +
+                std::to_string(src_typeid));
+        }
+        std::size_t total = fn(exec_q, src_size, src_data, cumsum_data,
+                               host_task_events, depends);
+        {
+            py::gil_scoped_release release;
+            sycl::event::wait(host_task_events);
+        }
+        return total;
+    }
+
+    const py::ssize_t *shape = src.get_shape_raw();
+    auto const &strides_vector = src.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT compact_shape;
+    shT compact_strides;
+
+    int src_nd = src.get_ndim();
+    int nd = src_nd;
+
+    dpctl::tensor::py_internal::compact_iteration_space(
+        nd, shape, strides_vector, compact_shape, compact_strides);
+
+    // Strided implementation
+    auto strided_fn = cumsum_1d_strided_dispatch_vector[src_typeid];
+    if (strided_fn == nullptr) {
+        throw std::runtime_error(
+            "this cumsum requires integer type, got src_typeid=" +
+            std::to_string(src_typeid));
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, compact_shape, compact_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    if (2 * static_cast<std::size_t>(nd) != std::get<1>(ptr_size_event_tuple)) {
+        {
+            py::gil_scoped_release release;
+
+            copy_shape_ev.wait();
+            sycl::event::wait(host_task_events);
+
+            // ensure USM deleter is called with GIL released
+            shape_strides_owner.reset(nullptr);
+        }
+        throw std::runtime_error("Unexpected error");
+    }
+
+    std::vector<sycl::event> dependent_events;
+    dependent_events.reserve(depends.size() + 1);
+    dependent_events.insert(dependent_events.end(), copy_shape_ev);
+    dependent_events.insert(dependent_events.end(), depends.begin(),
+                            depends.end());
+
+    std::size_t total =
+        strided_fn(exec_q, src_size, src_data, nd, shape_strides, cumsum_data,
+                   host_task_events, dependent_events);
+
+    {
+        py::gil_scoped_release release;
+        sycl::event::wait(host_task_events);
+
+        // ensure USM deleter is called with GIL released
+        shape_strides_owner.reset(nullptr);
+    }
+
+    return total;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators.hpp b/dpctl_ext/tensor/libtensor/source/accumulators.hpp
new file mode 100644
index 000000000000..42503093789b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/accumulators.hpp
@@ -0,0 +1,62 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void populate_mask_positions_dispatch_vectors(void);
+
+extern std::size_t
+    py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
+                      const dpctl::tensor::usm_ndarray &cumsum,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> &depends = {});
+
+extern void populate_cumsum_1d_dispatch_vectors(void);
+
+extern std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
+                                const dpctl::tensor::usm_ndarray &cumsum,
+                                sycl::queue &exec_q,
+                                std::vector<sycl::event> const &depends = {});
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp
new file mode 100644
index 000000000000..a78cb1750b81
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -0,0 +1,859 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines implementation functions of dpctl.tensor.place and
+/// dpctl.tensor.extract, dpctl.tensor.nonzero
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <numeric>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "boolean_advanced_indexing.hpp"
+#include "kernels/boolean_advanced_indexing.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+// Masked extraction
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::indexing::
+    masked_extract_all_slices_strided_impl_fn_ptr_t;
+
+static masked_extract_all_slices_strided_impl_fn_ptr_t
+    masked_extract_all_slices_strided_i32_impl_dispatch_vector
+        [td_ns::num_types];
+static masked_extract_all_slices_strided_impl_fn_ptr_t
+    masked_extract_all_slices_strided_i64_impl_dispatch_vector
+        [td_ns::num_types];
+
+using dpctl::tensor::kernels::indexing::
+    masked_extract_all_slices_contig_impl_fn_ptr_t;
+
+static masked_extract_all_slices_contig_impl_fn_ptr_t
+    masked_extract_all_slices_contig_i32_impl_dispatch_vector[td_ns::num_types];
+static masked_extract_all_slices_contig_impl_fn_ptr_t
+    masked_extract_all_slices_contig_i64_impl_dispatch_vector[td_ns::num_types];
+
+using dpctl::tensor::kernels::indexing::
+    masked_extract_some_slices_strided_impl_fn_ptr_t;
+
+static masked_extract_some_slices_strided_impl_fn_ptr_t
+    masked_extract_some_slices_strided_i32_impl_dispatch_vector
+        [td_ns::num_types];
+static masked_extract_some_slices_strided_impl_fn_ptr_t
+    masked_extract_some_slices_strided_i64_impl_dispatch_vector
+        [td_ns::num_types];
+
+void populate_masked_extract_dispatch_vectors(void)
+{
+    using dpctl::tensor::kernels::indexing::
+        MaskExtractAllSlicesStridedFactoryForInt32;
+    td_ns::DispatchVectorBuilder<
+        masked_extract_all_slices_strided_impl_fn_ptr_t,
+        MaskExtractAllSlicesStridedFactoryForInt32, td_ns::num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(
+        masked_extract_all_slices_strided_i32_impl_dispatch_vector);
+
+    using dpctl::tensor::kernels::indexing::
+        MaskExtractAllSlicesStridedFactoryForInt64;
+    td_ns::DispatchVectorBuilder<
+        masked_extract_all_slices_strided_impl_fn_ptr_t,
+        MaskExtractAllSlicesStridedFactoryForInt64, td_ns::num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(
+        masked_extract_all_slices_strided_i64_impl_dispatch_vector);
+
+    using dpctl::tensor::kernels::indexing::
+        MaskExtractSomeSlicesStridedFactoryForInt32;
+    td_ns::DispatchVectorBuilder<
+        masked_extract_some_slices_strided_impl_fn_ptr_t,
+        MaskExtractSomeSlicesStridedFactoryForInt32, td_ns::num_types>
+        dvb3;
+    dvb3.populate_dispatch_vector(
+        masked_extract_some_slices_strided_i32_impl_dispatch_vector);
+
+    using dpctl::tensor::kernels::indexing::
+        MaskExtractSomeSlicesStridedFactoryForInt64;
+    td_ns::DispatchVectorBuilder<
+        masked_extract_some_slices_strided_impl_fn_ptr_t,
+        MaskExtractSomeSlicesStridedFactoryForInt64, td_ns::num_types>
+        dvb4;
+    dvb4.populate_dispatch_vector(
+        masked_extract_some_slices_strided_i64_impl_dispatch_vector);
+
+    using dpctl::tensor::kernels::indexing::
+        MaskExtractAllSlicesContigFactoryForInt32;
+    td_ns::DispatchVectorBuilder<masked_extract_all_slices_contig_impl_fn_ptr_t,
+                                 MaskExtractAllSlicesContigFactoryForInt32,
+                                 td_ns::num_types>
+        dvb5;
+    dvb5.populate_dispatch_vector(
+        masked_extract_all_slices_contig_i32_impl_dispatch_vector);
+
+    using dpctl::tensor::kernels::indexing::
+        MaskExtractAllSlicesContigFactoryForInt64;
+    td_ns::DispatchVectorBuilder<masked_extract_all_slices_contig_impl_fn_ptr_t,
+                                 MaskExtractAllSlicesContigFactoryForInt64,
+                                 td_ns::num_types>
+        dvb6;
+    dvb6.populate_dispatch_vector(
+        masked_extract_all_slices_contig_i64_impl_dispatch_vector);
+}
+
+std::pair<sycl::event, sycl::event>
+    py_extract(const dpctl::tensor::usm_ndarray &src,
+               const dpctl::tensor::usm_ndarray &cumsum,
+               int axis_start, // axis_start <= mask_i < axis_end
+               int axis_end,
+               const dpctl::tensor::usm_ndarray &dst,
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    int src_nd = src.get_ndim();
+    if ((axis_start < 0 || axis_end > src_nd || axis_start >= axis_end)) {
+        throw py::value_error("Specified axes_start and axes_end are invalid.");
+    }
+    int mask_span_sz = axis_end - axis_start;
+
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd + (mask_span_sz - 1)) {
+        throw py::value_error("Number of dimensions of source and destination "
+                              "arrays is not consistent");
+    }
+
+    if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) {
+        throw py::value_error("cumsum array must be a C-contiguous vector");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    py::ssize_t cumsum_sz = cumsum.get_size();
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool same_ortho_dims(true);
+    std::size_t ortho_nelems(1); // number of orthogonal iterations
+
+    for (auto i = 0; i < axis_start; ++i) {
+        auto src_sh_i = src_shape[i];
+        ortho_nelems *= src_sh_i;
+        same_ortho_dims = same_ortho_dims && (src_sh_i == dst_shape[i]);
+    }
+    for (auto i = axis_end; i < src_nd; ++i) {
+        auto src_sh_i = src_shape[i];
+        ortho_nelems *= src_sh_i;
+        same_ortho_dims =
+            same_ortho_dims && (src_sh_i == dst_shape[i - (mask_span_sz - 1)]);
+    }
+
+    std::size_t masked_src_nelems(1);
+    std::size_t masked_dst_nelems(dst_shape[axis_start]);
+    for (auto i = axis_start; i < axis_end; ++i) {
+        masked_src_nelems *= src_shape[i];
+    }
+
+    // masked_dst_nelems is number of set elements in the mask, or last element
+    // in cumsum
+    if (!same_ortho_dims ||
+        (masked_src_nelems != static_cast<std::size_t>(cumsum_sz)))
+    {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, ortho_nelems * masked_dst_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with src, not with cumsum.
+    if (overlap(dst, cumsum) || overlap(dst, src)) {
+        throw py::value_error("Destination array overlaps with inputs");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+    int cumsum_typenum = cumsum.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    static constexpr int int32_typeid =
+        static_cast<int>(td_ns::typenum_t::INT32);
+    static constexpr int int64_typeid =
+        static_cast<int>(td_ns::typenum_t::INT64);
+    if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
+        throw py::value_error("Unexpected data type of cumsum array, expecting "
+                              "'int32' or 'int64'");
+    }
+
+    const bool use_i32 = (cumsum_typeid == int32_typeid);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array must have the same elemental data types");
+    }
+
+    char *src_data_p = src.get_data();
+    char *dst_data_p = dst.get_data();
+    char *cumsum_data_p = cumsum.get_data();
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+
+    auto dst_shape_vec = dst.get_shape_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    sycl::event extract_ev;
+    std::vector<sycl::event> host_task_events{};
+    if (axis_start == 0 && axis_end == src_nd) {
+        assert(dst_shape_vec.size() == 1);
+        assert(dst_strides_vec.size() == 1);
+
+        if (src.is_c_contiguous()) {
+            auto fn =
+                (use_i32)
+                    ? masked_extract_all_slices_contig_i32_impl_dispatch_vector
+                          [src_typeid]
+                    : masked_extract_all_slices_contig_i64_impl_dispatch_vector
+                          [src_typeid];
+
+            extract_ev =
+                fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, dst_data_p,
+                   dst_shape_vec[0], dst_strides_vec[0], depends);
+
+            //
+            host_task_events.push_back(extract_ev);
+        }
+        else {
+            // empty orthogonal directions
+            auto fn =
+                (use_i32)
+                    ? masked_extract_all_slices_strided_i32_impl_dispatch_vector
+                          [src_typeid]
+                    : masked_extract_all_slices_strided_i64_impl_dispatch_vector
+                          [src_typeid];
+
+            using dpctl::tensor::offset_utils::device_allocate_and_pack;
+            auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+                exec_q, host_task_events, src_shape_vec, src_strides_vec);
+            auto packed_src_shape_strides_owner =
+                std::move(std::get<0>(ptr_size_event_tuple1));
+            sycl::event copy_src_shape_strides_ev =
+                std::get<2>(ptr_size_event_tuple1);
+            const py::ssize_t *packed_src_shape_strides =
+                packed_src_shape_strides_owner.get();
+
+            std::vector<sycl::event> all_deps;
+            all_deps.reserve(depends.size() + 1);
+            all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+            all_deps.push_back(copy_src_shape_strides_ev);
+
+            assert(all_deps.size() == depends.size() + 1);
+
+            extract_ev = fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p,
+                            dst_data_p, src_nd, packed_src_shape_strides,
+                            dst_shape_vec[0], dst_strides_vec[0], all_deps);
+
+            sycl::event cleanup_tmp_allocations_ev =
+                dpctl::tensor::alloc_utils::async_smart_free(
+                    exec_q, {extract_ev}, packed_src_shape_strides_owner);
+            host_task_events.push_back(cleanup_tmp_allocations_ev);
+        }
+    }
+    else {
+        // non-empty orthogonal directions
+        auto fn =
+            (use_i32)
+                ? masked_extract_some_slices_strided_i32_impl_dispatch_vector
+                      [src_typeid]
+                : masked_extract_some_slices_strided_i64_impl_dispatch_vector
+                      [src_typeid];
+
+        int masked_src_nd = mask_span_sz;
+        int ortho_nd = src_nd - masked_src_nd;
+
+        using shT = std::vector<py::ssize_t>;
+
+        shT ortho_src_shape;
+        shT masked_src_shape;
+        shT ortho_src_strides;
+        shT masked_src_strides;
+        dpctl::tensor::py_internal::split_iteration_space(
+            src_shape_vec, src_strides_vec, axis_start, axis_end,
+            ortho_src_shape,
+            masked_src_shape, // 4 vectors modified
+            ortho_src_strides, masked_src_strides);
+
+        shT ortho_dst_shape;
+        shT masked_dst_shape;
+        shT ortho_dst_strides;
+        shT masked_dst_strides;
+        dpctl::tensor::py_internal::split_iteration_space(
+            dst_shape_vec, dst_strides_vec, axis_start, axis_start + 1,
+            ortho_dst_shape,
+            masked_dst_shape, // 4 vectors modified
+            ortho_dst_strides, masked_dst_strides);
+
+        assert(ortho_src_shape.size() == static_cast<std::size_t>(ortho_nd));
+        assert(ortho_dst_shape.size() == static_cast<std::size_t>(ortho_nd));
+        assert(std::equal(ortho_src_shape.begin(), ortho_src_shape.end(),
+                          ortho_dst_shape.begin()));
+
+        std::vector<py::ssize_t> simplified_ortho_shape;
+        std::vector<py::ssize_t> simplified_ortho_src_strides;
+        std::vector<py::ssize_t> simplified_ortho_dst_strides;
+
+        const py::ssize_t *_shape = ortho_src_shape.data();
+
+        py::ssize_t ortho_src_offset(0);
+        py::ssize_t ortho_dst_offset(0);
+
+        dpctl::tensor::py_internal::simplify_iteration_space(
+            ortho_nd, _shape, ortho_src_strides, ortho_dst_strides,
+            // output
+            simplified_ortho_shape, simplified_ortho_src_strides,
+            simplified_ortho_dst_strides, ortho_src_offset, ortho_dst_offset);
+
+        assert(masked_dst_shape.size() == 1);
+        assert(masked_dst_strides.size() == 1);
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, simplified_ortho_shape,
+            simplified_ortho_src_strides, simplified_ortho_dst_strides,
+            masked_src_shape, masked_src_strides);
+        auto packed_shapes_strides_owner =
+            std::move(std::get<0>(ptr_size_event_tuple1));
+        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
+        const py::ssize_t *packed_shapes_strides =
+            packed_shapes_strides_owner.get();
+
+        const py::ssize_t *packed_ortho_src_dst_shape_strides =
+            packed_shapes_strides;
+        const py::ssize_t *packed_masked_src_shape_strides =
+            packed_shapes_strides + (3 * ortho_nd);
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_shapes_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        // OrthogIndexerT orthog_src_dst_indexer_, MaskedIndexerT
+        // masked_src_indexer_, MaskedIndexerT masked_dst_indexer_
+        extract_ev = fn(exec_q, ortho_nelems, masked_src_nelems, src_data_p,
+                        cumsum_data_p, dst_data_p,
+                        // data to build orthog_src_dst_indexer
+                        ortho_nd, packed_ortho_src_dst_shape_strides,
+                        ortho_src_offset, ortho_dst_offset,
+                        // data to build masked_src_indexer
+                        masked_src_nd, packed_masked_src_shape_strides,
+                        // data to build masked_dst_indexer,
+                        masked_dst_shape[0], masked_dst_strides[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {extract_ev}, packed_shapes_strides_owner);
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+
+    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
+        exec_q, {src, cumsum, dst}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, extract_ev);
+}
+
+// Masked placement
+
+using dpctl::tensor::kernels::indexing::
+    masked_place_all_slices_strided_impl_fn_ptr_t;
+
+static masked_place_all_slices_strided_impl_fn_ptr_t
+    masked_place_all_slices_strided_i32_impl_dispatch_vector[td_ns::num_types];
+static masked_place_all_slices_strided_impl_fn_ptr_t
+    masked_place_all_slices_strided_i64_impl_dispatch_vector[td_ns::num_types];
+
+using dpctl::tensor::kernels::indexing::
+    masked_place_some_slices_strided_impl_fn_ptr_t;
+
+static masked_place_some_slices_strided_impl_fn_ptr_t
+    masked_place_some_slices_strided_i32_impl_dispatch_vector[td_ns::num_types];
+static masked_place_some_slices_strided_impl_fn_ptr_t
+    masked_place_some_slices_strided_i64_impl_dispatch_vector[td_ns::num_types];
+
+void populate_masked_place_dispatch_vectors(void)
+{
+    using dpctl::tensor::kernels::indexing::
+        MaskPlaceAllSlicesStridedFactoryForInt32;
+    td_ns::DispatchVectorBuilder<masked_place_all_slices_strided_impl_fn_ptr_t,
+                                 MaskPlaceAllSlicesStridedFactoryForInt32,
+                                 td_ns::num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(
+        masked_place_all_slices_strided_i32_impl_dispatch_vector);
+
+    using dpctl::tensor::kernels::indexing::
+        MaskPlaceAllSlicesStridedFactoryForInt64;
+    td_ns::DispatchVectorBuilder<masked_place_all_slices_strided_impl_fn_ptr_t,
+                                 MaskPlaceAllSlicesStridedFactoryForInt64,
+                                 td_ns::num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(
+        masked_place_all_slices_strided_i64_impl_dispatch_vector);
+
+    using dpctl::tensor::kernels::indexing::
+        MaskPlaceSomeSlicesStridedFactoryForInt32;
+    td_ns::DispatchVectorBuilder<masked_place_some_slices_strided_impl_fn_ptr_t,
+                                 MaskPlaceSomeSlicesStridedFactoryForInt32,
+                                 td_ns::num_types>
+        dvb3;
+    dvb3.populate_dispatch_vector(
+        masked_place_some_slices_strided_i32_impl_dispatch_vector);
+
+    using dpctl::tensor::kernels::indexing::
+        MaskPlaceSomeSlicesStridedFactoryForInt64;
+    td_ns::DispatchVectorBuilder<masked_place_some_slices_strided_impl_fn_ptr_t,
+                                 MaskPlaceSomeSlicesStridedFactoryForInt64,
+                                 td_ns::num_types>
+        dvb4;
+    dvb4.populate_dispatch_vector(
+        masked_place_some_slices_strided_i64_impl_dispatch_vector);
+}
+
+/*
+ * @brief Copy dst[i, ortho_id] = rhs[cumsum[i] - 1, ortho_id]  if cumsum[i] ==
+ * ((i > 0) ? cumsum[i-1] + 1 : 1)
+ */
+std::pair<sycl::event, sycl::event>
+    py_place(const dpctl::tensor::usm_ndarray &dst,
+             const dpctl::tensor::usm_ndarray &cumsum,
+             int axis_start, // axis_start <= mask_i < axis_end
+             int axis_end,
+             const dpctl::tensor::usm_ndarray &rhs,
+             sycl::queue &exec_q,
+             const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    int dst_nd = dst.get_ndim();
+    if ((axis_start < 0 || axis_end > dst_nd || axis_start >= axis_end)) {
+        throw py::value_error("Specified axes_start and axes_end are invalid.");
+    }
+    int mask_span_sz = axis_end - axis_start;
+
+    int rhs_nd = rhs.get_ndim();
+    if (dst_nd != rhs_nd + (mask_span_sz - 1)) {
+        throw py::value_error("Number of dimensions of source and destination "
+                              "arrays is not consistent");
+    }
+
+    if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) {
+        throw py::value_error("cumsum array must be a C-contiguous vector");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst, cumsum, rhs})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    py::ssize_t cumsum_sz = cumsum.get_size();
+
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    const py::ssize_t *rhs_shape = rhs.get_shape_raw();
+    bool same_ortho_dims(true);
+    std::size_t ortho_nelems(1); // number of orthogonal iterations
+
+    for (auto i = 0; i < axis_start; ++i) {
+        auto dst_sh_i = dst_shape[i];
+        ortho_nelems *= dst_sh_i;
+        same_ortho_dims = same_ortho_dims && (dst_sh_i == rhs_shape[i]);
+    }
+    for (auto i = axis_end; i < dst_nd; ++i) {
+        auto dst_sh_i = dst_shape[i];
+        ortho_nelems *= dst_sh_i;
+        same_ortho_dims =
+            same_ortho_dims && (dst_sh_i == rhs_shape[i - (mask_span_sz - 1)]);
+    }
+
+    std::size_t masked_dst_nelems(1);
+    for (auto i = axis_start; i < axis_end; ++i) {
+        masked_dst_nelems *= dst_shape[i];
+    }
+
+    if (!same_ortho_dims ||
+        (masked_dst_nelems != static_cast<std::size_t>(cumsum_sz)))
+    {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, ortho_nelems * masked_dst_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with src, not with cumsum.
+    if (overlap(dst, rhs) || overlap(dst, cumsum)) {
+        throw py::value_error("Destination array overlaps with inputs");
+    }
+
+    int dst_typenum = dst.get_typenum();
+    int rhs_typenum = rhs.get_typenum();
+    int cumsum_typenum = cumsum.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+    int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum);
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    static constexpr int int32_typeid =
+        static_cast<int>(td_ns::typenum_t::INT32);
+    static constexpr int int64_typeid =
+        static_cast<int>(td_ns::typenum_t::INT64);
+    if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
+        throw py::value_error("Unexpected data type of cumsum array, expecting "
+                              "'int32' or 'int64'");
+    }
+
+    const bool use_i32 = (cumsum_typeid == int32_typeid);
+
+    if (dst_typeid != rhs_typeid) {
+        throw py::value_error(
+            "Destination array must have the same elemental data types");
+    }
+
+    char *dst_data_p = dst.get_data();
+    char *rhs_data_p = rhs.get_data();
+    char *cumsum_data_p = cumsum.get_data();
+
+    auto dst_shape_vec = dst.get_shape_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    auto rhs_shape_vec = rhs.get_shape_vector();
+    auto rhs_strides_vec = rhs.get_strides_vector();
+
+    sycl::event place_ev;
+    std::vector<sycl::event> host_task_events{};
+    if (axis_start == 0 && axis_end == dst_nd) {
+        // empty orthogonal directions
+        auto fn = (use_i32)
+                      ? masked_place_all_slices_strided_i32_impl_dispatch_vector
+                            [dst_typeid]
+                      : masked_place_all_slices_strided_i64_impl_dispatch_vector
+                            [dst_typeid];
+
+        assert(rhs_shape_vec.size() == 1);
+        assert(rhs_strides_vec.size() == 1);
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, dst_shape_vec, dst_strides_vec);
+        auto packed_dst_shape_strides_owner =
+            std::move(std::get<0>(ptr_size_event_tuple1));
+        sycl::event copy_dst_shape_strides_ev =
+            std::get<2>(ptr_size_event_tuple1);
+        const py::ssize_t *packed_dst_shape_strides =
+            packed_dst_shape_strides_owner.get();
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_dst_shape_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        place_ev = fn(exec_q, cumsum_sz, dst_data_p, cumsum_data_p, rhs_data_p,
+                      dst_nd, packed_dst_shape_strides, rhs_shape_vec[0],
+                      rhs_strides_vec[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {place_ev}, packed_dst_shape_strides_owner);
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+    else {
+        // non-empty orthogonal directions
+        auto fn =
+            (use_i32)
+                ? masked_place_some_slices_strided_i32_impl_dispatch_vector
+                      [dst_typeid]
+                : masked_place_some_slices_strided_i64_impl_dispatch_vector
+                      [dst_typeid];
+
+        int masked_dst_nd = mask_span_sz;
+        int ortho_nd = dst_nd - masked_dst_nd;
+
+        using shT = std::vector<py::ssize_t>;
+
+        shT ortho_dst_shape;
+        shT masked_dst_shape;
+        shT ortho_dst_strides;
+        shT masked_dst_strides;
+        dpctl::tensor::py_internal::split_iteration_space(
+            dst_shape_vec, dst_strides_vec, axis_start, axis_end,
+            ortho_dst_shape,
+            masked_dst_shape, // 4 vectors modified
+            ortho_dst_strides, masked_dst_strides);
+
+        shT ortho_rhs_shape;
+        shT masked_rhs_shape;
+        shT ortho_rhs_strides;
+        shT masked_rhs_strides;
+        dpctl::tensor::py_internal::split_iteration_space(
+            rhs_shape_vec, rhs_strides_vec, axis_start, axis_start + 1,
+            ortho_rhs_shape,
+            masked_rhs_shape, // 4 vectors modified
+            ortho_rhs_strides, masked_rhs_strides);
+
+        assert(ortho_dst_shape.size() == static_cast<std::size_t>(ortho_nd));
+        assert(ortho_rhs_shape.size() == static_cast<std::size_t>(ortho_nd));
+        assert(std::equal(ortho_dst_shape.begin(), ortho_dst_shape.end(),
+                          ortho_rhs_shape.begin()));
+
+        std::vector<py::ssize_t> simplified_ortho_shape;
+        std::vector<py::ssize_t> simplified_ortho_dst_strides;
+        std::vector<py::ssize_t> simplified_ortho_rhs_strides;
+
+        const py::ssize_t *_shape = ortho_dst_shape.data();
+
+        py::ssize_t ortho_dst_offset(0);
+        py::ssize_t ortho_rhs_offset(0);
+
+        dpctl::tensor::py_internal::simplify_iteration_space(
+            ortho_nd, _shape, ortho_dst_strides, ortho_rhs_strides,
+            simplified_ortho_shape, simplified_ortho_dst_strides,
+            simplified_ortho_rhs_strides, ortho_dst_offset, ortho_rhs_offset);
+
+        assert(masked_rhs_shape.size() == 1);
+        assert(masked_rhs_strides.size() == 1);
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, simplified_ortho_shape,
+            simplified_ortho_dst_strides, simplified_ortho_rhs_strides,
+            masked_dst_shape, masked_dst_strides);
+        auto packed_shapes_strides_owner =
+            std::move(std::get<0>(ptr_size_event_tuple1));
+        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
+        const py::ssize_t *packed_shapes_strides =
+            packed_shapes_strides_owner.get();
+
+        const py::ssize_t *packed_ortho_dst_rhs_shape_strides =
+            packed_shapes_strides;
+        const py::ssize_t *packed_masked_dst_shape_strides =
+            packed_shapes_strides + (3 * ortho_nd);
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_shapes_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        place_ev = fn(exec_q, ortho_nelems, masked_dst_nelems, dst_data_p,
+                      cumsum_data_p, rhs_data_p,
+                      // data to build orthog_dst_rhs_indexer
+                      ortho_nd, packed_ortho_dst_rhs_shape_strides,
+                      ortho_dst_offset, ortho_rhs_offset,
+                      // data to build masked_dst_indexer
+                      masked_dst_nd, packed_masked_dst_shape_strides,
+                      // data to build masked_dst_indexer,
+                      masked_rhs_shape[0], masked_rhs_strides[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {place_ev}, packed_shapes_strides_owner);
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+
+    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
+        exec_q, {dst, cumsum, rhs}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, place_ev);
+}
+
+// Non-zero
+
+std::pair<sycl::event, sycl::event>
+    py_nonzero(const dpctl::tensor::usm_ndarray
+                   &cumsum, // int32/int64 input array, 1D, C-contiguous
+               const dpctl::tensor::usm_ndarray
+                   &indexes, // int32/int64 2D output array, C-contiguous
+               const std::vector<py::ssize_t>
+                   &mask_shape, // shape of array from which cumsum was computed
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends)
+{
+    if (!dpctl::utils::queues_are_compatible(exec_q, {cumsum, indexes})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(indexes);
+
+    int cumsum_nd = cumsum.get_ndim();
+    if (cumsum_nd != 1 || !cumsum.is_c_contiguous()) {
+        throw py::value_error("Cumsum array must be a C-contiguous vector");
+    }
+
+    int indexes_nd = indexes.get_ndim();
+    if (indexes_nd != 2 || !indexes.is_c_contiguous()) {
+        throw py::value_error("Index array must be a C-contiguous matrix");
+    }
+
+    std::size_t _ndim = mask_shape.size();
+    if (_ndim > std::numeric_limits<int>::max()) {
+        throw py::value_error("Shape is too large");
+    }
+    int ndim = static_cast<int>(_ndim);
+
+    const py::ssize_t *indexes_shape = indexes.get_shape_raw();
+
+    if (ndim != indexes_shape[0]) {
+        throw py::value_error(
+            "Length of shape must equal width of index matrix");
+    }
+
+    auto cumsum_sz = cumsum.get_size();
+    py::ssize_t shape_nelems =
+        std::accumulate(mask_shape.begin(), mask_shape.end(), py::ssize_t(1),
+                        std::multiplies<py::ssize_t>());
+
+    if (cumsum_sz != shape_nelems) {
+        throw py::value_error("Shape and cumsum size are not consistent");
+    }
+
+    py::ssize_t nz_elems = indexes_shape[1];
+
+    int indexes_typenum = indexes.get_typenum();
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int indexes_typeid = array_types.typenum_to_lookup_id(indexes_typenum);
+
+    int cumsum_typenum = cumsum.get_typenum();
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    constexpr int int32_typeid = static_cast<int>(td_ns::typenum_t::INT32);
+    constexpr int int64_typeid = static_cast<int>(td_ns::typenum_t::INT64);
+
+    // cumsum must be int32_t or int64_t only
+    if ((cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) ||
+        (indexes_typeid != int32_typeid && indexes_typeid != int64_typeid))
+    {
+        throw py::value_error("Cumulative sum array and index array must have "
+                              "int32 or int64 data-type");
+    }
+
+    if (cumsum_sz == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(cumsum, indexes)) {
+        throw py::value_error("Arrays are expected to ave no memory overlap");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        indexes, nz_elems * _ndim);
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto mask_shape_copying_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, mask_shape);
+    auto src_shape_device_owner =
+        std::move(std::get<0>(mask_shape_copying_tuple));
+    sycl::event copy_ev = std::get<2>(mask_shape_copying_tuple);
+    const py::ssize_t *src_shape_device_ptr = src_shape_device_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_ev);
+
+    using dpctl::tensor::kernels::indexing::non_zero_indexes_fn_ptr_t;
+    using dpctl::tensor::kernels::indexing::non_zero_indexes_impl;
+
+    int fn_index = ((cumsum_typeid == int64_typeid) ? 1 : 0) +
+                   ((indexes_typeid == int64_typeid) ? 2 : 0);
+    std::array<non_zero_indexes_fn_ptr_t, 4> fn_impls = {
+        non_zero_indexes_impl<std::int32_t, std::int32_t>,
+        non_zero_indexes_impl<std::int64_t, std::int32_t>,
+        non_zero_indexes_impl<std::int32_t, std::int64_t>,
+        non_zero_indexes_impl<std::int64_t, std::int64_t>};
+    auto fn = fn_impls[fn_index];
+
+    sycl::event non_zero_indexes_ev =
+        fn(exec_q, cumsum_sz, nz_elems, ndim, cumsum.get_data(),
+           indexes.get_data(), src_shape_device_ptr, all_deps);
+
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {non_zero_indexes_ev}, src_shape_device_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
+        exec_q, {cumsum, indexes}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, non_zero_indexes_ev);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp
new file mode 100644
index 000000000000..71eafc77b00c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp
@@ -0,0 +1,81 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    py_extract(const dpctl::tensor::usm_ndarray &src,
+               const dpctl::tensor::usm_ndarray &cumsum,
+               int axis_start, // axis_start <= mask_i < axis_end
+               int axis_end,
+               const dpctl::tensor::usm_ndarray &dst,
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends = {});
+
+extern void populate_masked_extract_dispatch_vectors(void);
+
+extern std::pair<sycl::event, sycl::event>
+    py_place(const dpctl::tensor::usm_ndarray &dst,
+             const dpctl::tensor::usm_ndarray &cumsum,
+             int axis_start, // axis_start <= mask_i < axis_end
+             int axis_end,
+             const dpctl::tensor::usm_ndarray &rhs,
+             sycl::queue &exec_q,
+             const std::vector<sycl::event> &depends = {});
+
+extern void populate_masked_place_dispatch_vectors(void);
+
+extern std::pair<sycl::event, sycl::event>
+    py_nonzero(const dpctl::tensor::usm_ndarray
+                   &cumsum, // int32 input array, 1D, C-contiguous
+               const dpctl::tensor::usm_ndarray
+                   &indexes, // int32 2D output array, C-contiguous
+               const std::vector<py::ssize_t>
+                   &mask_shape, // shape of array from which cumsum was computed
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends = {});
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/eye_ctor.cpp b/dpctl_ext/tensor/libtensor/source/eye_ctor.cpp
new file mode 100644
index 000000000000..025a7d58d06e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/eye_ctor.cpp
@@ -0,0 +1,142 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "eye_ctor.hpp"
+#include "kernels/constructors.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+using dpctl::utils::keep_args_alive;
+
+using dpctl::tensor::kernels::constructors::eye_fn_ptr_t;
+static eye_fn_ptr_t eye_dispatch_vector[td_ns::num_types];
+
+std::pair<sycl::event, sycl::event>
+    usm_ndarray_eye(py::ssize_t k,
+                    const dpctl::tensor::usm_ndarray &dst,
+                    sycl::queue &exec_q,
+                    const std::vector<sycl::event> &depends)
+{
+    // dst must be 2D
+
+    if (dst.get_ndim() != 2) {
+        throw py::value_error(
+            "usm_ndarray_eye: Expecting 2D array to populate");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error("Execution queue is not compatible with the "
+                              "allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int dst_typenum = dst.get_typenum();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    const py::ssize_t nelem = dst.get_size();
+    const py::ssize_t rows = dst.get_shape(0);
+    const py::ssize_t cols = dst.get_shape(1);
+    if (rows == 0 || cols == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_dst_f_contig = dst.is_f_contiguous();
+    if (!is_dst_c_contig && !is_dst_f_contig) {
+        throw py::value_error("USM array is not contiguous");
+    }
+
+    py::ssize_t start;
+    if (is_dst_c_contig) {
+        start = (k < 0) ? -k * cols : k;
+    }
+    else {
+        start = (k < 0) ? -k : k * rows;
+    }
+
+    const py::ssize_t *strides = dst.get_strides_raw();
+    py::ssize_t step;
+    if (strides == nullptr) {
+        step = (is_dst_c_contig) ? cols + 1 : rows + 1;
+    }
+    else {
+        step = strides[0] + strides[1];
+    }
+
+    const py::ssize_t length = std::min({rows, cols, rows + k, cols - k});
+    const py::ssize_t end = start + step * (length - 1);
+
+    char *dst_data = dst.get_data();
+    sycl::event eye_event;
+
+    auto fn = eye_dispatch_vector[dst_typeid];
+
+    eye_event = fn(exec_q, static_cast<std::size_t>(nelem), start, end, step,
+                   dst_data, depends);
+
+    return std::make_pair(keep_args_alive(exec_q, {dst}, {eye_event}),
+                          eye_event);
+}
+
+void init_eye_ctor_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::constructors::EyeFactory;
+
+    DispatchVectorBuilder<eye_fn_ptr_t, EyeFactory, num_types> dvb;
+    dvb.populate_dispatch_vector(eye_dispatch_vector);
+
+    return;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/eye_ctor.hpp b/dpctl_ext/tensor/libtensor/source/eye_ctor.hpp
new file mode 100644
index 000000000000..dda7f2c4813a
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/eye_ctor.hpp
@@ -0,0 +1,57 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    usm_ndarray_eye(py::ssize_t k,
+                    const dpctl::tensor::usm_ndarray &dst,
+                    sycl::queue &exec_q,
+                    const std::vector<sycl::event> &depends = {});
+
+extern void init_eye_ctor_dispatch_vectors(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
index 3e5be4d9e8fe..98ab488e5879 100644
--- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
+++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
@@ -43,8 +43,8 @@
 
 #include "dpnp4pybind11.hpp"
 
-// #include "accumulators.hpp"
-// #include "boolean_advanced_indexing.hpp"
+#include "accumulators.hpp"
+#include "boolean_advanced_indexing.hpp"
 // #include "clip.hpp"
 #include "copy_and_cast_usm_to_usm.hpp"
 #include "copy_as_contig.hpp"
@@ -52,7 +52,7 @@
 #include "copy_for_roll.hpp"
 #include "copy_numpy_ndarray_into_usm_ndarray.hpp"
 #include "device_support_queries.hpp"
-// #include "eye_ctor.hpp"
+#include "eye_ctor.hpp"
 #include "full_ctor.hpp"
 #include "integer_advanced_indexing.hpp"
 #include "kernels/dpctl_tensor_types.hpp"
@@ -112,19 +112,19 @@ using dpctl::tensor::py_internal::usm_ndarray_zeros;
 using dpctl::tensor::py_internal::usm_ndarray_put;
 using dpctl::tensor::py_internal::usm_ndarray_take;
 
-// using dpctl::tensor::py_internal::py_extract;
-// using dpctl::tensor::py_internal::py_mask_positions;
-// using dpctl::tensor::py_internal::py_nonzero;
-// using dpctl::tensor::py_internal::py_place;
+using dpctl::tensor::py_internal::py_extract;
+using dpctl::tensor::py_internal::py_mask_positions;
+using dpctl::tensor::py_internal::py_nonzero;
+using dpctl::tensor::py_internal::py_place;
 
 /* ================= Repeat ====================*/
-// using dpctl::tensor::py_internal::py_cumsum_1d;
+using dpctl::tensor::py_internal::py_cumsum_1d;
 // using dpctl::tensor::py_internal::py_repeat_by_scalar;
 // using dpctl::tensor::py_internal::py_repeat_by_sequence;
 
 /* ================ Eye ================== */
 
-// using dpctl::tensor::py_internal::usm_ndarray_eye;
+using dpctl::tensor::py_internal::usm_ndarray_eye;
 
 /* =========================== Tril and triu ============================== */
 
@@ -160,15 +160,15 @@ void init_dispatch_vectors(void)
     // init_linear_sequences_dispatch_vectors();
     init_full_ctor_dispatch_vectors();
     init_zeros_ctor_dispatch_vectors();
-    // init_eye_ctor_dispatch_vectors();
+    init_eye_ctor_dispatch_vectors();
     init_triul_ctor_dispatch_vectors();
 
-    // populate_masked_extract_dispatch_vectors();
-    // populate_masked_place_dispatch_vectors();
+    populate_masked_extract_dispatch_vectors();
+    populate_masked_place_dispatch_vectors();
 
-    // populate_mask_positions_dispatch_vectors();
+    populate_mask_positions_dispatch_vectors();
 
-    // populate_cumsum_1d_dispatch_vectors();
+    populate_cumsum_1d_dispatch_vectors();
     // init_repeat_dispatch_vectors();
 
     // init_clip_dispatch_vectors();
@@ -348,15 +348,15 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("mode"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
-    // m.def("_eye", &usm_ndarray_eye,
-    //       "Fills input 2D contiguous usm_ndarray `dst` with "
-    //       "zeros outside of the diagonal "
-    //       "specified by "
-    //       "the diagonal index `k` "
-    //       "which is filled with ones."
-    //       "Returns a tuple of events: (ht_event, comp_event)",
-    //       py::arg("k"), py::arg("dst"), py::arg("sycl_queue"),
-    //       py::arg("depends") = py::list());
+    m.def("_eye", &usm_ndarray_eye,
+          "Fills input 2D contiguous usm_ndarray `dst` with "
+          "zeros outside of the diagonal "
+          "specified by "
+          "the diagonal index `k` "
+          "which is filled with ones."
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("k"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 
     m.def("default_device_fp_type",
           dpctl::tensor::py_internal::default_device_fp_type,
@@ -408,16 +408,16 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
-    // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"),
-    //       py::arg("cumsum"), py::arg("sycl_queue"),
-    //       py::arg("depends") = py::list());
+    m.def("mask_positions", &py_mask_positions, "", py::arg("mask"),
+          py::arg("cumsum"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 
-    // m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"),
-    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+    m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
 
-    // m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"),
-    //       py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"),
-    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+    m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"),
+          py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
 
     auto overlap = [](const dpctl::tensor::usm_ndarray &x1,
                       const dpctl::tensor::usm_ndarray &x2) -> bool {
@@ -438,13 +438,13 @@ PYBIND11_MODULE(_tensor_impl, m)
           "Determines if the memory regions indexed by each array are the same",
           py::arg("array1"), py::arg("array2"));
 
-    // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"),
-    //       py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"),
-    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+    m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"),
+          py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
 
-    // m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"),
-    //       py::arg("mask_shape"), py::arg("sycl_queue"),
-    //       py::arg("depends") = py::list());
+    m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"),
+          py::arg("mask_shape"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 
     // m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"),
     //       py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"),
diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py
index acda579a5f5e..0727b9bfd775 100644
--- a/dpnp/dpnp_container.py
+++ b/dpnp/dpnp_container.py
@@ -196,7 +196,7 @@ def eye(
         order = "C"
 
     """Creates `dpnp_array` with ones on the `k`th diagonal."""
-    array_obj = dpt.eye(
+    array_obj = dpt_ext.eye(
         N,
         M,
         k=k,
diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index b0769337c38b..f305b106221f 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -47,9 +47,6 @@
 import dpctl.tensor as dpt
 import dpctl.utils as dpu
 import numpy
-from dpctl.tensor._copy_utils import _nonzero_impl
-from dpctl.tensor._indexing_functions import _get_indexing_mode
-from dpctl.tensor._numpy_helper import normalize_axis_index
 
 # pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
@@ -60,6 +57,9 @@
 
 # pylint: disable=no-name-in-module
 import dpnp.backend.extensions.indexing._indexing_impl as indexing_ext
+from dpctl_ext.tensor._copy_utils import _nonzero_impl
+from dpctl_ext.tensor._indexing_functions import _get_indexing_mode
+from dpctl_ext.tensor._numpy_helper import normalize_axis_index
 
 # pylint: disable=no-name-in-module
 from .dpnp_algo import (
@@ -817,13 +817,13 @@ def extract(condition, a):
         usm_a = dpt_ext.reshape(usm_a, -1)
         usm_cond = dpt_ext.reshape(usm_cond, -1)
 
-        usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0])
+        usm_res = dpt_ext.take(usm_a, dpt_ext.nonzero(usm_cond)[0])
     else:
         if usm_cond.shape != usm_a.shape:
             usm_a = dpt_ext.reshape(usm_a, -1)
             usm_cond = dpt_ext.reshape(usm_cond, -1)
 
-        usm_res = dpt.extract(usm_cond, usm_a)
+        usm_res = dpt_ext.extract(usm_cond, usm_a)
 
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
@@ -1546,7 +1546,7 @@ def nonzero(a):
 
     usm_a = dpnp.get_usm_ndarray(a)
     return tuple(
-        dpnp_array._create_from_usm_ndarray(y) for y in dpt.nonzero(usm_a)
+        dpnp_array._create_from_usm_ndarray(y) for y in dpt_ext.nonzero(usm_a)
     )
 
 
@@ -1619,7 +1619,7 @@ def place(a, mask, vals):
             usm_vals, usm_a.dtype, casting="safe", copy=False
         )
 
-    dpt.place(usm_a, usm_mask, usm_vals)
+    dpt_ext.place(usm_a, usm_mask, usm_vals)
 
 
 def put(a, ind, v, /, *, axis=None, mode="wrap"):
@@ -1807,7 +1807,7 @@ def put_along_axis(a, ind, values, axis, mode="wrap"):
             values, usm_type=a.usm_type, sycl_queue=a.sycl_queue
         )
 
-    dpt.put_along_axis(usm_a, usm_ind, usm_vals, axis=axis, mode=mode)
+    dpt_ext.put_along_axis(usm_a, usm_ind, usm_vals, axis=axis, mode=mode)
 
 
 def putmask(x1, mask, values):
@@ -2295,7 +2295,7 @@ def take_along_axis(a, indices, axis=-1, mode="wrap"):
     usm_a = dpnp.get_usm_ndarray(a)
     usm_ind = dpnp.get_usm_ndarray(indices)
 
-    usm_res = dpt.take_along_axis(usm_a, usm_ind, axis=axis, mode=mode)
+    usm_res = dpt_ext.take_along_axis(usm_a, usm_ind, axis=axis, mode=mode)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index 4866c912ab6a..e988bbaa237b 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -415,7 +415,7 @@ def _get_first_nan_index(usm_a):
         if first_nan is not None:
             # all NaNs are collapsed, so need to replace the indices with
             # the index of the first NaN value in result array of unique values
-            dpt.place(
+            dpt_ext.place(
                 usm_res.inverse_indices,
                 usm_res.inverse_indices > first_nan,
                 dpt_ext.reshape(first_nan, 1),

From ecd4991fe7eba4fa5c6c58a89152250c53d8f6b3 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Thu, 5 Mar 2026 10:42:03 +0100
Subject: [PATCH 06/43] Clean up dpctl.tensor code (#2797)

This PR adds a small clean up to already porting dpctl.tensor code:
* remove unused includes
* add missing includes
* remove redundant namespace qualifications when calling function from
the same namespace
---
 .../include/kernels/constructors.hpp          |  2 +-
 .../kernels/elementwise_functions/common.hpp  |  3 --
 .../elementwise_functions/logaddexp.hpp       |  1 -
 .../kernels/elementwise_functions/maximum.hpp |  1 -
 .../kernels/elementwise_functions/minimum.hpp |  1 -
 .../kernels/integer_advanced_indexing.hpp     |  2 +-
 .../tensor/libtensor/source/accumulators.cpp  | 13 +++---
 .../tensor/libtensor/source/accumulators.hpp  |  1 -
 .../source/boolean_advanced_indexing.cpp      | 40 ++++++++----------
 .../source/copy_and_cast_usm_to_usm.cpp       |  9 ++--
 .../libtensor/source/copy_as_contig.cpp       | 42 +++++++++----------
 .../tensor/libtensor/source/copy_for_roll.cpp |  9 ++--
 .../source/device_support_queries.cpp         |  2 +-
 .../tensor/libtensor/source/full_ctor.cpp     |  5 ++-
 .../source/integer_advanced_indexing.cpp      |  3 --
 .../tensor/libtensor/source/tensor_ctors.cpp  |  3 --
 .../tensor/libtensor/source/zeros_ctor.cpp    |  2 -
 .../tensor/libtensor/source/zeros_ctor.hpp    |  1 -
 18 files changed, 59 insertions(+), 81 deletions(-)

diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
index 26ae46707a6b..f48dfa4d4077 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
@@ -33,8 +33,8 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
+
 #include <array>
-#include <complex>
 #include <cstddef>
 #include <vector>
 
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index d19930b722a9..e83426df8aa9 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -33,11 +33,8 @@
 #pragma once
 
 #include <algorithm>
-#include <cmath>
-#include <complex>
 #include <cstddef>
 #include <cstdint>
-#include <limits>
 #include <type_traits>
 #include <vector>
 
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index 8565df2cf528..7337b6e43eab 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -46,7 +46,6 @@
 #include "vec_size_util.hpp"
 
 #include "utils/math_utils.hpp"
-#include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index 067ccd84f059..f204b6640042 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -45,7 +45,6 @@
 #include "vec_size_util.hpp"
 
 #include "utils/math_utils.hpp"
-#include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index a38945f89a25..d18577a5cf4e 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -44,7 +44,6 @@
 #include "vec_size_util.hpp"
 
 #include "utils/math_utils.hpp"
-#include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
index 7be2b3ea8591..f6d2f0175ce8 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
@@ -33,7 +33,7 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <complex>
+
 #include <cstddef>
 #include <type_traits>
 #include <vector>
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators.cpp b/dpctl_ext/tensor/libtensor/source/accumulators.cpp
index 82913010755a..c6ab96418d47 100644
--- a/dpctl_ext/tensor/libtensor/source/accumulators.cpp
+++ b/dpctl_ext/tensor/libtensor/source/accumulators.cpp
@@ -32,17 +32,18 @@
 /// This file defines functions of dpctl.tensor._tensor_impl extensions
 //===----------------------------------------------------------------------===//
 
-#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <stdexcept>
+#include <string>
+#include <tuple>
+#include <utility>
 #include <vector>
 
 #include <sycl/sycl.hpp>
 
 #include "dpnp4pybind11.hpp"
 #include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
 
 #include "kernels/accumulators.hpp"
 #include "simplify_iteration_space.hpp"
@@ -196,8 +197,8 @@ std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
     int mask_nd = mask.get_ndim();
     int nd = mask_nd;
 
-    dpctl::tensor::py_internal::compact_iteration_space(
-        nd, shape, strides_vector, compact_shape, compact_strides);
+    compact_iteration_space(nd, shape, strides_vector, compact_shape,
+                            compact_strides);
 
     // Strided implementation
     auto strided_fn =
@@ -351,8 +352,8 @@ std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
     int src_nd = src.get_ndim();
     int nd = src_nd;
 
-    dpctl::tensor::py_internal::compact_iteration_space(
-        nd, shape, strides_vector, compact_shape, compact_strides);
+    compact_iteration_space(nd, shape, strides_vector, compact_shape,
+                            compact_strides);
 
     // Strided implementation
     auto strided_fn = cumsum_1d_strided_dispatch_vector[src_typeid];
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators.hpp b/dpctl_ext/tensor/libtensor/source/accumulators.hpp
index 42503093789b..e400aad2dceb 100644
--- a/dpctl_ext/tensor/libtensor/source/accumulators.hpp
+++ b/dpctl_ext/tensor/libtensor/source/accumulators.hpp
@@ -39,7 +39,6 @@
 #include <sycl/sycl.hpp>
 
 #include "dpnp4pybind11.hpp"
-#include <pybind11/pybind11.h>
 
 namespace dpctl::tensor::py_internal
 {
diff --git a/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp
index a78cb1750b81..4c46e1e2fec8 100644
--- a/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp
+++ b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -336,21 +336,19 @@ std::pair<sycl::event, sycl::event>
         shT masked_src_shape;
         shT ortho_src_strides;
         shT masked_src_strides;
-        dpctl::tensor::py_internal::split_iteration_space(
-            src_shape_vec, src_strides_vec, axis_start, axis_end,
-            ortho_src_shape,
-            masked_src_shape, // 4 vectors modified
-            ortho_src_strides, masked_src_strides);
+        split_iteration_space(src_shape_vec, src_strides_vec, axis_start,
+                              axis_end, ortho_src_shape,
+                              masked_src_shape, // 4 vectors modified
+                              ortho_src_strides, masked_src_strides);
 
         shT ortho_dst_shape;
         shT masked_dst_shape;
         shT ortho_dst_strides;
         shT masked_dst_strides;
-        dpctl::tensor::py_internal::split_iteration_space(
-            dst_shape_vec, dst_strides_vec, axis_start, axis_start + 1,
-            ortho_dst_shape,
-            masked_dst_shape, // 4 vectors modified
-            ortho_dst_strides, masked_dst_strides);
+        split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start,
+                              axis_start + 1, ortho_dst_shape,
+                              masked_dst_shape, // 4 vectors modified
+                              ortho_dst_strides, masked_dst_strides);
 
         assert(ortho_src_shape.size() == static_cast<std::size_t>(ortho_nd));
         assert(ortho_dst_shape.size() == static_cast<std::size_t>(ortho_nd));
@@ -366,7 +364,7 @@ std::pair<sycl::event, sycl::event>
         py::ssize_t ortho_src_offset(0);
         py::ssize_t ortho_dst_offset(0);
 
-        dpctl::tensor::py_internal::simplify_iteration_space(
+        simplify_iteration_space(
             ortho_nd, _shape, ortho_src_strides, ortho_dst_strides,
             // output
             simplified_ortho_shape, simplified_ortho_src_strides,
@@ -646,21 +644,19 @@ std::pair<sycl::event, sycl::event>
         shT masked_dst_shape;
         shT ortho_dst_strides;
         shT masked_dst_strides;
-        dpctl::tensor::py_internal::split_iteration_space(
-            dst_shape_vec, dst_strides_vec, axis_start, axis_end,
-            ortho_dst_shape,
-            masked_dst_shape, // 4 vectors modified
-            ortho_dst_strides, masked_dst_strides);
+        split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start,
+                              axis_end, ortho_dst_shape,
+                              masked_dst_shape, // 4 vectors modified
+                              ortho_dst_strides, masked_dst_strides);
 
         shT ortho_rhs_shape;
         shT masked_rhs_shape;
         shT ortho_rhs_strides;
         shT masked_rhs_strides;
-        dpctl::tensor::py_internal::split_iteration_space(
-            rhs_shape_vec, rhs_strides_vec, axis_start, axis_start + 1,
-            ortho_rhs_shape,
-            masked_rhs_shape, // 4 vectors modified
-            ortho_rhs_strides, masked_rhs_strides);
+        split_iteration_space(rhs_shape_vec, rhs_strides_vec, axis_start,
+                              axis_start + 1, ortho_rhs_shape,
+                              masked_rhs_shape, // 4 vectors modified
+                              ortho_rhs_strides, masked_rhs_strides);
 
         assert(ortho_dst_shape.size() == static_cast<std::size_t>(ortho_nd));
         assert(ortho_rhs_shape.size() == static_cast<std::size_t>(ortho_nd));
@@ -676,7 +672,7 @@ std::pair<sycl::event, sycl::event>
         py::ssize_t ortho_dst_offset(0);
         py::ssize_t ortho_rhs_offset(0);
 
-        dpctl::tensor::py_internal::simplify_iteration_space(
+        simplify_iteration_space(
             ortho_nd, _shape, ortho_dst_strides, ortho_rhs_strides,
             simplified_ortho_shape, simplified_ortho_dst_strides,
             simplified_ortho_rhs_strides, ortho_dst_offset, ortho_rhs_offset);
diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
index 3d20be02f885..9ea49ae1d88b 100644
--- a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
+++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
@@ -188,11 +188,10 @@ std::pair<sycl::event, sycl::event> copy_usm_ndarray_into_usm_ndarray(
     const py::ssize_t *shape = src_shape;
 
     // nd, simplified_* and *_offset are modified by reference
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, shape, src_strides, dst_strides,
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
+    simplify_iteration_space(nd, shape, src_strides, dst_strides,
+                             // output
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
 
     if (nd < 2) {
         if (nd == 1) {
diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp
index bbee24c95d4d..5d78862651fc 100644
--- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp
+++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp
@@ -225,11 +225,11 @@ std::pair<sycl::event, sycl::event>
     int nd = src_nd;
 
     // nd, simplified_* and *_offset are modified by reference
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(),
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
+    simplify_iteration_space(nd, src_shape_vec.data(), src_strides_vec,
+                             dst.get_strides_vector(),
+                             // output
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
 
     if (!((0 == src_offset) && (0 == dst_offset))) {
         throw std::runtime_error(
@@ -359,11 +359,11 @@ std::pair<sycl::event, sycl::event>
     int nd = src_nd;
 
     // nd, simplified_* and *_offset are modified by reference
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(),
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
+    simplify_iteration_space(nd, src_shape_vec.data(), src_strides_vec,
+                             dst.get_strides_vector(),
+                             // output
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
 
     if (!((0 == src_offset) && (0 == dst_offset))) {
         throw std::runtime_error(
@@ -521,12 +521,11 @@ std::pair<sycl::event, sycl::event>
     int nd = static_cast<int>(batch_shape_vec.size());
 
     // nd, simplified_* and *_offset are modified by reference
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, batch_shape_vec.data(), src_batch_strides_vec,
-        dst_batch_strides_vec,
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
+    simplify_iteration_space(nd, batch_shape_vec.data(), src_batch_strides_vec,
+                             dst_batch_strides_vec,
+                             // output
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
 
     if (!((0 == src_offset) && (0 == dst_offset))) {
         throw std::runtime_error(
@@ -714,12 +713,11 @@ std::pair<sycl::event, sycl::event>
     int nd = static_cast<int>(batch_shape_vec.size());
 
     // nd, simplified_* and *_offset are modified by reference
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, batch_shape_vec.data(), src_batch_strides_vec,
-        dst_batch_strides_vec,
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
+    simplify_iteration_space(nd, batch_shape_vec.data(), src_batch_strides_vec,
+                             dst_batch_strides_vec,
+                             // output
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
 
     if (!((0 == src_offset) && (0 == dst_offset))) {
         throw std::runtime_error(
diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp
index a187b2247677..7742c1c96a4e 100644
--- a/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp
+++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp
@@ -197,11 +197,10 @@ std::pair<sycl::event, sycl::event>
     const py::ssize_t *shape = src_shape_ptr;
 
     // nd, simplified_* and *_offset are modified by reference
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, shape, src_strides, dst_strides,
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
+    simplify_iteration_space(nd, shape, src_strides, dst_strides,
+                             // output
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
 
     if (nd == 1 && simplified_src_strides[0] == 1 &&
         simplified_dst_strides[0] == 1) {
diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp
index 97a8ba83831e..3cc0952c2080 100644
--- a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp
+++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp
@@ -36,7 +36,7 @@
 
 #include "dpnp4pybind11.hpp"
 #include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
+
 #include <sycl/sycl.hpp>
 
 namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp
index aef57836666e..dfe1d25b769c 100644
--- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp
+++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp
@@ -32,7 +32,6 @@
 /// This file defines functions of dpctl.tensor._tensor_impl extensions
 //===--------------------------------------------------------------------===//
 
-#include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <tuple>
@@ -42,11 +41,13 @@
 #include <sycl/sycl.hpp>
 
 #include "dpnp4pybind11.hpp"
-#include <pybind11/complex.h>
+#include <pybind11/complex.h> // py::cast<std::complex<T>>
 #include <pybind11/pybind11.h>
 
 #include "kernels/constructors.hpp"
+#include "utils/offset_utils.hpp"
 #include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
 #include "utils/type_dispatch.hpp"
 #include "utils/type_utils.hpp"
 
diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp
index 925cc2e895ed..c6021bdfd2d1 100644
--- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp
+++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp
@@ -34,7 +34,6 @@
 //===----------------------------------------------------------------------===//
 
 #include <algorithm>
-#include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <iterator>
@@ -47,9 +46,7 @@
 #include <sycl/sycl.hpp>
 
 #include "dpnp4pybind11.hpp"
-#include <pybind11/complex.h>
 #include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
 
 #include "kernels/integer_advanced_indexing.hpp"
 #include "utils/memory_overlap.hpp"
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
index 98ab488e5879..7b151c773fe0 100644
--- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
+++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
@@ -72,9 +72,6 @@ static_assert(std::is_same_v<py::ssize_t, dpctl::tensor::ssize_t>);
 namespace
 {
 
-using dpctl::tensor::c_contiguous_strides;
-using dpctl::tensor::f_contiguous_strides;
-
 using dpctl::tensor::overlap::MemoryOverlap;
 using dpctl::tensor::overlap::SameLogicalTensors;
 
diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp
index 2eb05e49f382..b9a2e01bea4a 100644
--- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp
+++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp
@@ -32,7 +32,6 @@
 /// This file defines functions of dpctl.tensor._tensor_impl extensions
 //===--------------------------------------------------------------------===//
 
-#include <complex>
 #include <cstddef>
 #include <stdexcept>
 #include <utility>
@@ -41,7 +40,6 @@
 #include <sycl/sycl.hpp>
 
 #include "dpnp4pybind11.hpp"
-#include <pybind11/complex.h>
 #include <pybind11/pybind11.h>
 
 #include "utils/output_validation.hpp"
diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp
index 51a1903a0f36..d104e37f5533 100644
--- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp
+++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp
@@ -39,7 +39,6 @@
 #include <sycl/sycl.hpp>
 
 #include "dpnp4pybind11.hpp"
-#include <pybind11/pybind11.h>
 
 namespace dpctl::tensor::py_internal
 {

From 0edd3b18eb128a8ed45ca81cb54d4c143df9d206 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 5 Mar 2026 13:27:40 +0100
Subject: [PATCH 07/43] Extend `._tensor_impl` with where(), clip() and type
 utils functions (#2778)

This PR extends `_tensor_impl` in `dpctl_ext.tensor` with the `_where,
_clip` and repeat functions
(`_repeat_by_sequence, _repeat_by_scalar`)

It also adds `repeat(), where(), clip()` and `can_cast, finfo, iinfo,
isdtype, result_type` from `_type_utils.py` `to dpctl_ext.tensor and
updates the corresponding dpnp functions to use these implementations
internally
---
 dpctl_ext/tensor/CMakeLists.txt               |  12 +-
 dpctl_ext/tensor/__init__.py                  |  22 +-
 dpctl_ext/tensor/_clip.py                     | 781 ++++++++++++++
 dpctl_ext/tensor/_copy_utils.py               |   6 +-
 dpctl_ext/tensor/_manipulation_functions.py   | 273 ++++-
 dpctl_ext/tensor/_scalar_utils.py             | 122 +++
 dpctl_ext/tensor/_search_functions.py         | 419 ++++++++
 dpctl_ext/tensor/_type_utils.py               | 999 ++++++++++++++++++
 .../tensor/libtensor/include/kernels/clip.hpp | 357 +++++++
 .../libtensor/include/kernels/repeat.hpp      | 460 ++++++++
 .../libtensor/include/kernels/where.hpp       | 338 ++++++
 dpctl_ext/tensor/libtensor/source/clip.cpp    | 265 +++++
 dpctl_ext/tensor/libtensor/source/clip.hpp    |  57 +
 dpctl_ext/tensor/libtensor/source/repeat.cpp  | 820 ++++++++++++++
 dpctl_ext/tensor/libtensor/source/repeat.hpp  |  83 ++
 .../tensor/libtensor/source/tensor_ctors.cpp  | 120 ++-
 dpctl_ext/tensor/libtensor/source/where.cpp   | 265 +++++
 dpctl_ext/tensor/libtensor/source/where.hpp   |  57 +
 dpnp/dpnp_algo/dpnp_arraycreation.py          |   2 +-
 dpnp/dpnp_algo/dpnp_elementwise_common.py     |   2 +-
 dpnp/dpnp_array.py                            |   4 +-
 dpnp/dpnp_iface_functional.py                 |   8 +-
 dpnp/dpnp_iface_indexing.py                   |   2 +-
 dpnp/dpnp_iface_manipulation.py               |  16 +-
 dpnp/dpnp_iface_mathematical.py               |  19 +-
 dpnp/dpnp_iface_searching.py                  |   5 +-
 dpnp/dpnp_iface_sorting.py                    |   2 +-
 dpnp/dpnp_iface_statistics.py                 |   2 +-
 dpnp/dpnp_iface_trigonometric.py              |   4 +-
 dpnp/dpnp_iface_types.py                      |   9 +-
 dpnp/dpnp_utils/dpnp_utils_common.py          |   5 +-
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py   |  10 +-
 dpnp/dpnp_utils/dpnp_utils_statistics.py      |   5 +-
 dpnp/fft/dpnp_utils_fft.py                    |   8 +-
 dpnp/linalg/dpnp_iface_linalg.py              |   5 +-
 dpnp/linalg/dpnp_utils_linalg.py              |   2 +-
 dpnp/tests/test_arraymanipulation.py          |   5 +-
 dpnp/tests/test_counting.py                   |   5 +-
 dpnp/tests/test_flipping.py                   |   5 +-
 dpnp/tests/test_indexing.py                   |   7 +-
 dpnp/tests/test_linalg.py                     |   5 +-
 dpnp/tests/test_manipulation.py               |   5 +-
 dpnp/tests/test_mathematical.py               |  11 +-
 dpnp/tests/test_product.py                    |   5 +-
 dpnp/tests/test_sort.py                       |   5 +-
 .../cupy/core_tests/test_ndarray.py           |   5 +-
 .../cupy/lib_tests/test_shape_base.py         |   5 +-
 .../cupy/manipulation_tests/test_dims.py      |   5 +-
 .../cupy/manipulation_tests/test_transpose.py |   5 +-
 .../cupy/math_tests/test_sumprod.py           |   5 +-
 .../cupy/sorting_tests/test_sort.py           |   5 +-
 .../cupy/statistics_tests/test_meanvar.py     |   5 +-
 dpnp/tests/third_party/cupy/testing/_loops.py |   5 +-
 53 files changed, 5517 insertions(+), 142 deletions(-)
 create mode 100644 dpctl_ext/tensor/_clip.py
 create mode 100644 dpctl_ext/tensor/_scalar_utils.py
 create mode 100644 dpctl_ext/tensor/_search_functions.py
 create mode 100644 dpctl_ext/tensor/_type_utils.py
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/clip.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/where.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/clip.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/clip.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/repeat.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/repeat.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/where.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/where.hpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index 0b166a202735..6f823a818ce7 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -58,10 +58,10 @@ set(_tensor_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
 )
 
 set(_static_lib_trgt simplify_iteration_space)
@@ -92,10 +92,10 @@ endif()
 
 set(_no_fast_math_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
     # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
 )
 #list(
 #APPEND _no_fast_math_sources
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index fa76faccc632..2c1e761beb3b 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -27,20 +27,21 @@
 # *****************************************************************************
 
 
-from dpctl_ext.tensor._copy_utils import (
+from ._clip import clip
+from ._copy_utils import (
     asnumpy,
     astype,
     copy,
     from_numpy,
     to_numpy,
 )
-from dpctl_ext.tensor._ctors import (
+from ._ctors import (
     eye,
     full,
     tril,
     triu,
 )
-from dpctl_ext.tensor._indexing_functions import (
+from ._indexing_functions import (
     extract,
     nonzero,
     place,
@@ -49,28 +50,39 @@
     take,
     take_along_axis,
 )
-from dpctl_ext.tensor._manipulation_functions import (
+from ._manipulation_functions import (
+    repeat,
     roll,
 )
-from dpctl_ext.tensor._reshape import reshape
+from ._reshape import reshape
+from ._search_functions import where
+from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type
 
 __all__ = [
     "asnumpy",
     "astype",
+    "can_cast",
     "copy",
+    "clip",
     "extract",
     "eye",
+    "finfo",
     "from_numpy",
     "full",
+    "iinfo",
+    "isdtype",
     "nonzero",
     "place",
     "put",
     "put_along_axis",
+    "repeat",
     "reshape",
+    "result_type",
     "roll",
     "take",
     "take_along_axis",
     "to_numpy",
     "tril",
     "triu",
+    "where",
 ]
diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py
new file mode 100644
index 000000000000..f145e9f2d98d
--- /dev/null
+++ b/dpctl_ext/tensor/_clip.py
@@ -0,0 +1,781 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import dpctl.tensor as dpt
+import dpctl.tensor._tensor_elementwise_impl as tei
+from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._tensor_impl as ti
+
+from ._copy_utils import (
+    _empty_like_orderK,
+    _empty_like_pair_orderK,
+    _empty_like_triple_orderK,
+)
+from ._manipulation_functions import _broadcast_shape_impl
+from ._scalar_utils import (
+    _get_dtype,
+    _get_queue_usm_type,
+    _get_shape,
+    _validate_dtype,
+)
+from ._type_utils import (
+    _can_cast,
+    _resolve_one_strong_one_weak_types,
+    _resolve_one_strong_two_weak_types,
+)
+
+
+def _check_clip_dtypes(res_dtype, arg1_dtype, arg2_dtype, sycl_dev):
+    """
+    Checks if both types `arg1_dtype` and `arg2_dtype` can be
+    cast to `res_dtype` according to the rule `safe`
+    """
+    if arg1_dtype == res_dtype and arg2_dtype == res_dtype:
+        return None, None, res_dtype
+
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if _can_cast(arg1_dtype, res_dtype, _fp16, _fp64) and _can_cast(
+        arg2_dtype, res_dtype, _fp16, _fp64
+    ):
+        # prevent unnecessary casting
+        ret_buf1_dt = None if res_dtype == arg1_dtype else res_dtype
+        ret_buf2_dt = None if res_dtype == arg2_dtype else res_dtype
+        return ret_buf1_dt, ret_buf2_dt, res_dtype
+    else:
+        return None, None, None
+
+
+def _clip_none(x, val, out, order, _binary_fn):
+    q1, x_usm_type = x.sycl_queue, x.usm_type
+    q2, val_usm_type = _get_queue_usm_type(val)
+    if q2 is None:
+        exec_q = q1
+        res_usm_type = x_usm_type
+    else:
+        exec_q = dpctl.utils.get_execution_queue((q1, q2))
+        if exec_q is None:
+            raise ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        res_usm_type = dpctl.utils.get_coerced_usm_type(
+            (
+                x_usm_type,
+                val_usm_type,
+            )
+        )
+    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+    x_shape = x.shape
+    val_shape = _get_shape(val)
+    if not isinstance(val_shape, (tuple, list)):
+        raise TypeError(
+            "Shape of arguments can not be inferred. "
+            "Arguments are expected to be "
+            "lists, tuples, or both"
+        )
+    try:
+        res_shape = _broadcast_shape_impl(
+            [
+                x_shape,
+                val_shape,
+            ]
+        )
+    except ValueError:
+        raise ValueError(
+            "operands could not be broadcast together with shapes "
+            f"{x_shape} and {val_shape}"
+        )
+    sycl_dev = exec_q.sycl_device
+    x_dtype = x.dtype
+    val_dtype = _get_dtype(val, sycl_dev)
+    if not _validate_dtype(val_dtype):
+        raise ValueError("Operands have unsupported data types")
+
+    val_dtype = _resolve_one_strong_one_weak_types(x_dtype, val_dtype, sycl_dev)
+
+    res_dt = x.dtype
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if not _can_cast(val_dtype, res_dt, _fp16, _fp64):
+        raise ValueError(
+            f"function 'clip' does not support input types "
+            f"({x_dtype}, {val_dtype}), "
+            "and the inputs could not be safely coerced to any "
+            "supported types according to the casting rule ''safe''."
+        )
+
+    orig_out = out
+    if out is not None:
+        if not isinstance(out, dpt.usm_ndarray):
+            raise TypeError(
+                f"output array must be of usm_ndarray type, got {type(out)}"
+            )
+
+        if not out.flags.writable:
+            raise ValueError("provided `out` array is read-only")
+
+        if out.shape != res_shape:
+            raise ValueError(
+                "The shape of input and output arrays are inconsistent. "
+                f"Expected output shape is {res_shape}, got {out.shape}"
+            )
+
+        if res_dt != out.dtype:
+            raise ValueError(
+                f"Output array of type {res_dt} is needed, got {out.dtype}"
+            )
+
+        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise ExecutionPlacementError(
+                "Input and output allocation queues are not compatible"
+            )
+
+        if ti._array_overlap(x, out):
+            if not ti._same_logical_tensors(x, out):
+                out = dpt.empty_like(out)
+
+        if isinstance(val, dpt.usm_ndarray):
+            if (
+                ti._array_overlap(val, out)
+                and not ti._same_logical_tensors(val, out)
+                and val_dtype == res_dt
+            ):
+                out = dpt.empty_like(out)
+
+    if isinstance(val, dpt.usm_ndarray):
+        val_ary = val
+    else:
+        val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q)
+
+    if order == "A":
+        order = (
+            "F"
+            if all(
+                arr.flags.f_contiguous
+                for arr in (
+                    x,
+                    val_ary,
+                )
+            )
+            else "C"
+        )
+    if val_dtype == res_dt:
+        if out is None:
+            if order == "K":
+                out = _empty_like_pair_orderK(
+                    x, val_ary, res_dt, res_shape, res_usm_type, exec_q
+                )
+            else:
+                out = dpt.empty(
+                    res_shape,
+                    dtype=res_dt,
+                    usm_type=res_usm_type,
+                    sycl_queue=exec_q,
+                    order=order,
+                )
+        if x_shape != res_shape:
+            x = dpt.broadcast_to(x, res_shape)
+        if val_ary.shape != res_shape:
+            val_ary = dpt.broadcast_to(val_ary, res_shape)
+        _manager = SequentialOrderManager[exec_q]
+        dep_evs = _manager.submitted_events
+        ht_binary_ev, binary_ev = _binary_fn(
+            src1=x, src2=val_ary, dst=out, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_binary_ev, binary_ev)
+        if not (orig_out is None or orig_out is out):
+            # Copy the out data from temporary buffer to original memory
+            ht_copy_out_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out,
+                dst=orig_out,
+                sycl_queue=exec_q,
+                depends=[binary_ev],
+            )
+            _manager.add_event_pair(ht_copy_out_ev, copy_ev)
+            out = orig_out
+        return out
+    else:
+        if order == "K":
+            buf = _empty_like_orderK(val_ary, res_dt)
+        else:
+            buf = dpt.empty_like(val_ary, dtype=res_dt, order=order)
+        _manager = SequentialOrderManager[exec_q]
+        dep_evs = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=val_ary, dst=buf, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        if out is None:
+            if order == "K":
+                out = _empty_like_pair_orderK(
+                    x, buf, res_dt, res_shape, res_usm_type, exec_q
+                )
+            else:
+                out = dpt.empty(
+                    res_shape,
+                    dtype=res_dt,
+                    usm_type=res_usm_type,
+                    sycl_queue=exec_q,
+                    order=order,
+                )
+
+        if x_shape != res_shape:
+            x = dpt.broadcast_to(x, res_shape)
+        buf = dpt.broadcast_to(buf, res_shape)
+        ht_binary_ev, binary_ev = _binary_fn(
+            src1=x,
+            src2=buf,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_binary_ev, binary_ev)
+        if not (orig_out is None or orig_out is out):
+            # Copy the out data from temporary buffer to original memory
+            ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out,
+                dst=orig_out,
+                sycl_queue=exec_q,
+                depends=[binary_ev],
+            )
+            _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+            out = orig_out
+        return out
+
+
+def clip(x, /, min=None, max=None, out=None, order="K"):
+    """clip(x, min=None, max=None, out=None, order="K")
+
+    Clips to the range [`min_i`, `max_i`] for each element `x_i`
+    in `x`.
+
+    Args:
+        x (usm_ndarray): Array containing elements to clip.
+            Must be compatible with `min` and `max` according
+            to broadcasting rules.
+        min ({None, Union[usm_ndarray, bool, int, float, complex]}, optional):
+            Array containing minimum values.
+            Must be compatible with `x` and `max` according
+            to broadcasting rules.
+        max ({None, Union[usm_ndarray, bool, int, float, complex]}, optional):
+            Array containing maximum values.
+            Must be compatible with `x` and `min` according
+            to broadcasting rules.
+        out ({None, usm_ndarray}, optional):
+            Output array to populate.
+            Array must have the correct shape and the expected data type.
+        order ("C","F","A","K", optional):
+            Memory layout of the newly output array, if parameter `out` is
+            `None`.
+            Default: "K".
+
+    Returns:
+        usm_ndarray:
+            An array with elements clipped to the range [`min`, `max`].
+            The returned array has the same data type as `x`.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected `x` to be of dpctl.tensor.usm_ndarray type, got "
+            f"{type(x)}"
+        )
+    if order not in ["K", "C", "F", "A"]:
+        order = "K"
+    if x.dtype.kind in "iu":
+        if isinstance(min, int) and min <= dpt_ext.iinfo(x.dtype).min:
+            min = None
+        if isinstance(max, int) and max >= dpt_ext.iinfo(x.dtype).max:
+            max = None
+    if min is None and max is None:
+        exec_q = x.sycl_queue
+        orig_out = out
+        if out is not None:
+            if not isinstance(out, dpt.usm_ndarray):
+                raise TypeError(
+                    "output array must be of usm_ndarray type, got "
+                    f"{type(out)}"
+                )
+
+            if not out.flags.writable:
+                raise ValueError("provided `out` array is read-only")
+
+            if out.shape != x.shape:
+                raise ValueError(
+                    "The shape of input and output arrays are "
+                    f"inconsistent. Expected output shape is {x.shape}, "
+                    f"got {out.shape}"
+                )
+
+            if x.dtype != out.dtype:
+                raise ValueError(
+                    f"Output array of type {x.dtype} is needed, "
+                    f"got {out.dtype}"
+                )
+
+            if (
+                dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
+                is None
+            ):
+                raise ExecutionPlacementError(
+                    "Input and output allocation queues are not compatible"
+                )
+
+            if ti._array_overlap(x, out):
+                if not ti._same_logical_tensors(x, out):
+                    out = dpt.empty_like(out)
+                else:
+                    return out
+        else:
+            if order == "K":
+                out = _empty_like_orderK(x, x.dtype)
+            else:
+                out = dpt.empty_like(x, order=order)
+
+        _manager = SequentialOrderManager[exec_q]
+        dep_evs = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x, dst=out, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        if not (orig_out is None or orig_out is out):
+            # Copy the out data from temporary buffer to original memory
+            ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out,
+                dst=orig_out,
+                sycl_queue=exec_q,
+                depends=[copy_ev],
+            )
+            _manager.add_event_pair(ht_copy_ev, cpy_ev)
+            out = orig_out
+        return out
+    elif max is None:
+        return _clip_none(x, min, out, order, tei._maximum)
+    elif min is None:
+        return _clip_none(x, max, out, order, tei._minimum)
+    else:
+        q1, x_usm_type = x.sycl_queue, x.usm_type
+        q2, min_usm_type = _get_queue_usm_type(min)
+        q3, max_usm_type = _get_queue_usm_type(max)
+        if q2 is None and q3 is None:
+            exec_q = q1
+            res_usm_type = x_usm_type
+        elif q3 is None:
+            exec_q = dpctl.utils.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    min_usm_type,
+                )
+            )
+        elif q2 is None:
+            exec_q = dpctl.utils.get_execution_queue((q1, q3))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    max_usm_type,
+                )
+            )
+        else:
+            exec_q = dpctl.utils.get_execution_queue((q1, q2, q3))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    min_usm_type,
+                    max_usm_type,
+                )
+            )
+        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+        x_shape = x.shape
+        min_shape = _get_shape(min)
+        max_shape = _get_shape(max)
+        if not all(
+            isinstance(s, (tuple, list))
+            for s in (
+                min_shape,
+                max_shape,
+            )
+        ):
+            raise TypeError(
+                "Shape of arguments can not be inferred. "
+                "Arguments are expected to be "
+                "lists, tuples, or both"
+            )
+        try:
+            res_shape = _broadcast_shape_impl(
+                [
+                    x_shape,
+                    min_shape,
+                    max_shape,
+                ]
+            )
+        except ValueError:
+            raise ValueError(
+                "operands could not be broadcast together with shapes "
+                f"{x_shape}, {min_shape}, and {max_shape}"
+            )
+        sycl_dev = exec_q.sycl_device
+        x_dtype = x.dtype
+        min_dtype = _get_dtype(min, sycl_dev)
+        max_dtype = _get_dtype(max, sycl_dev)
+        if not all(_validate_dtype(o) for o in (min_dtype, max_dtype)):
+            raise ValueError("Operands have unsupported data types")
+
+        min_dtype, max_dtype = _resolve_one_strong_two_weak_types(
+            x_dtype, min_dtype, max_dtype, sycl_dev
+        )
+
+        buf1_dt, buf2_dt, res_dt = _check_clip_dtypes(
+            x_dtype,
+            min_dtype,
+            max_dtype,
+            sycl_dev,
+        )
+
+        if res_dt is None:
+            raise ValueError(
+                f"function '{clip}' does not support input types "
+                f"({x_dtype}, {min_dtype}, {max_dtype}), "
+                "and the inputs could not be safely coerced to any "
+                "supported types according to the casting rule ''safe''."
+            )
+
+        orig_out = out
+        if out is not None:
+            if not isinstance(out, dpt.usm_ndarray):
+                raise TypeError(
+                    "output array must be of usm_ndarray type, got "
+                    f"{type(out)}"
+                )
+
+            if not out.flags.writable:
+                raise ValueError("provided `out` array is read-only")
+
+            if out.shape != res_shape:
+                raise ValueError(
+                    "The shape of input and output arrays are "
+                    f"inconsistent. Expected output shape is {res_shape}, "
+                    f"got {out.shape}"
+                )
+
+            if res_dt != out.dtype:
+                raise ValueError(
+                    f"Output array of type {res_dt} is needed, "
+                    f"got {out.dtype}"
+                )
+
+            if (
+                dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
+                is None
+            ):
+                raise ExecutionPlacementError(
+                    "Input and output allocation queues are not compatible"
+                )
+
+            if ti._array_overlap(x, out):
+                if not ti._same_logical_tensors(x, out):
+                    out = dpt.empty_like(out)
+
+            if isinstance(min, dpt.usm_ndarray):
+                if (
+                    ti._array_overlap(min, out)
+                    and not ti._same_logical_tensors(min, out)
+                    and buf1_dt is None
+                ):
+                    out = dpt.empty_like(out)
+
+            if isinstance(max, dpt.usm_ndarray):
+                if (
+                    ti._array_overlap(max, out)
+                    and not ti._same_logical_tensors(max, out)
+                    and buf2_dt is None
+                ):
+                    out = dpt.empty_like(out)
+
+        if isinstance(min, dpt.usm_ndarray):
+            a_min = min
+        else:
+            a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q)
+        if isinstance(max, dpt.usm_ndarray):
+            a_max = max
+        else:
+            a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q)
+
+        if order == "A":
+            order = (
+                "F"
+                if all(
+                    arr.flags.f_contiguous
+                    for arr in (
+                        x,
+                        a_min,
+                        a_max,
+                    )
+                )
+                else "C"
+            )
+        if buf1_dt is None and buf2_dt is None:
+            if out is None:
+                if order == "K":
+                    out = _empty_like_triple_orderK(
+                        x,
+                        a_min,
+                        a_max,
+                        res_dt,
+                        res_shape,
+                        res_usm_type,
+                        exec_q,
+                    )
+                else:
+                    out = dpt.empty(
+                        res_shape,
+                        dtype=res_dt,
+                        usm_type=res_usm_type,
+                        sycl_queue=exec_q,
+                        order=order,
+                    )
+            if x_shape != res_shape:
+                x = dpt.broadcast_to(x, res_shape)
+            if a_min.shape != res_shape:
+                a_min = dpt.broadcast_to(a_min, res_shape)
+            if a_max.shape != res_shape:
+                a_max = dpt.broadcast_to(a_max, res_shape)
+            _manager = SequentialOrderManager[exec_q]
+            dep_ev = _manager.submitted_events
+            ht_binary_ev, binary_ev = ti._clip(
+                src=x,
+                min=a_min,
+                max=a_max,
+                dst=out,
+                sycl_queue=exec_q,
+                depends=dep_ev,
+            )
+            _manager.add_event_pair(ht_binary_ev, binary_ev)
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out,
+                    dst=orig_out,
+                    sycl_queue=exec_q,
+                    depends=[binary_ev],
+                )
+                _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+                out = orig_out
+            return out
+
+        elif buf1_dt is None:
+            if order == "K":
+                buf2 = _empty_like_orderK(a_max, buf2_dt)
+            else:
+                buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order)
+            _manager = SequentialOrderManager[exec_q]
+            dep_ev = _manager.submitted_events
+            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_ev
+            )
+            _manager.add_event_pair(ht_copy_ev, copy_ev)
+            if out is None:
+                if order == "K":
+                    out = _empty_like_triple_orderK(
+                        x,
+                        a_min,
+                        buf2,
+                        res_dt,
+                        res_shape,
+                        res_usm_type,
+                        exec_q,
+                    )
+                else:
+                    out = dpt.empty(
+                        res_shape,
+                        dtype=res_dt,
+                        usm_type=res_usm_type,
+                        sycl_queue=exec_q,
+                        order=order,
+                    )
+
+            x = dpt.broadcast_to(x, res_shape)
+            if a_min.shape != res_shape:
+                a_min = dpt.broadcast_to(a_min, res_shape)
+            buf2 = dpt.broadcast_to(buf2, res_shape)
+            ht_binary_ev, binary_ev = ti._clip(
+                src=x,
+                min=a_min,
+                max=buf2,
+                dst=out,
+                sycl_queue=exec_q,
+                depends=[copy_ev],
+            )
+            _manager.add_event_pair(ht_binary_ev, binary_ev)
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out,
+                    dst=orig_out,
+                    sycl_queue=exec_q,
+                    depends=[binary_ev],
+                )
+                _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+                out = orig_out
+            return out
+
+        elif buf2_dt is None:
+            if order == "K":
+                buf1 = _empty_like_orderK(a_min, buf1_dt)
+            else:
+                buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order)
+            _manager = SequentialOrderManager[exec_q]
+            dep_ev = _manager.submitted_events
+            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_ev
+            )
+            _manager.add_event_pair(ht_copy_ev, copy_ev)
+            if out is None:
+                if order == "K":
+                    out = _empty_like_triple_orderK(
+                        x,
+                        buf1,
+                        a_max,
+                        res_dt,
+                        res_shape,
+                        res_usm_type,
+                        exec_q,
+                    )
+                else:
+                    out = dpt.empty(
+                        res_shape,
+                        dtype=res_dt,
+                        usm_type=res_usm_type,
+                        sycl_queue=exec_q,
+                        order=order,
+                    )
+
+            x = dpt.broadcast_to(x, res_shape)
+            buf1 = dpt.broadcast_to(buf1, res_shape)
+            if a_max.shape != res_shape:
+                a_max = dpt.broadcast_to(a_max, res_shape)
+            ht_binary_ev, binary_ev = ti._clip(
+                src=x,
+                min=buf1,
+                max=a_max,
+                dst=out,
+                sycl_queue=exec_q,
+                depends=[copy_ev],
+            )
+            _manager.add_event_pair(ht_binary_ev, binary_ev)
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out,
+                    dst=orig_out,
+                    sycl_queue=exec_q,
+                    depends=[binary_ev],
+                )
+                _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+                out = orig_out
+            return out
+
+        if order == "K":
+            if (
+                x.flags.c_contiguous
+                and a_min.flags.c_contiguous
+                and a_max.flags.c_contiguous
+            ):
+                order = "C"
+            elif (
+                x.flags.f_contiguous
+                and a_min.flags.f_contiguous
+                and a_max.flags.f_contiguous
+            ):
+                order = "F"
+        if order == "K":
+            buf1 = _empty_like_orderK(a_min, buf1_dt)
+        else:
+            buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order)
+
+        _manager = SequentialOrderManager[exec_q]
+        dep_evs = _manager.submitted_events
+        ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy1_ev, copy1_ev)
+        if order == "K":
+            buf2 = _empty_like_orderK(a_max, buf2_dt)
+        else:
+            buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order)
+        ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy2_ev, copy2_ev)
+        if out is None:
+            if order == "K":
+                out = _empty_like_triple_orderK(
+                    x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
+                )
+            else:
+                out = dpt.empty(
+                    res_shape,
+                    dtype=res_dt,
+                    usm_type=res_usm_type,
+                    sycl_queue=exec_q,
+                    order=order,
+                )
+
+        x = dpt.broadcast_to(x, res_shape)
+        buf1 = dpt.broadcast_to(buf1, res_shape)
+        buf2 = dpt.broadcast_to(buf2, res_shape)
+        ht_, clip_ev = ti._clip(
+            src=x,
+            min=buf1,
+            max=buf2,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy1_ev, copy2_ev],
+        )
+        _manager.add_event_pair(ht_, clip_ev)
+        return out
diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py
index 5d1ac209c86b..64689057eb84 100644
--- a/dpctl_ext/tensor/_copy_utils.py
+++ b/dpctl_ext/tensor/_copy_utils.py
@@ -37,7 +37,6 @@
 import numpy as np
 from dpctl.tensor._data_types import _get_dtype
 from dpctl.tensor._device import normalize_queue_device
-from dpctl.tensor._type_utils import _dtype_supported_by_device_impl
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
@@ -45,6 +44,7 @@
 import dpctl_ext.tensor._tensor_impl as ti
 
 from ._numpy_helper import normalize_axis_index
+from ._type_utils import _dtype_supported_by_device_impl
 
 __doc__ = (
     "Implementation module for copy- and cast- operations on "
@@ -291,7 +291,7 @@ def _prepare_indices_arrays(inds, q, usm_type):
     )
 
     # promote to a common integral type if possible
-    ind_dt = dpt.result_type(*inds)
+    ind_dt = dpt_ext.result_type(*inds)
     if ind_dt.kind not in "ui":
         raise ValueError(
             "cannot safely promote indices to an integer data type"
@@ -1013,7 +1013,7 @@ def astype(
     else:
         target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue)
 
-    if not dpt.can_cast(ary_dtype, target_dtype, casting=casting):
+    if not dpt_ext.can_cast(ary_dtype, target_dtype, casting=casting):
         raise TypeError(
             f"Can not cast from {ary_dtype} to {newdtype} "
             f"according to rule {casting}."
diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py
index fa8fc27876b3..f1b8b46dbcbc 100644
--- a/dpctl_ext/tensor/_manipulation_functions.py
+++ b/dpctl_ext/tensor/_manipulation_functions.py
@@ -26,17 +26,20 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
 
+import itertools
 import operator
 
+import dpctl
 import dpctl.tensor as dpt
 import dpctl.utils as dputils
 import numpy as np
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._tensor_impl as ti
 
-from ._numpy_helper import normalize_axis_tuple
+from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
 
 __doc__ = (
     "Implementation module for array manipulation "
@@ -44,6 +47,274 @@
 )
 
 
+def _broadcast_shape_impl(shapes):
+    if len(set(shapes)) == 1:
+        return shapes[0]
+    mutable_shapes = False
+    nds = [len(s) for s in shapes]
+    biggest = max(nds)
+    sh_len = len(shapes)
+    for i in range(sh_len):
+        diff = biggest - nds[i]
+        if diff > 0:
+            ty = type(shapes[i])
+            shapes[i] = ty(
+                itertools.chain(itertools.repeat(1, diff), shapes[i])
+            )
+    common_shape = []
+    for axis in range(biggest):
+        lengths = [s[axis] for s in shapes]
+        unique = set(lengths + [1])
+        if len(unique) > 2:
+            raise ValueError(
+                "Shape mismatch: two or more arrays have "
+                f"incompatible dimensions on axis ({axis},)"
+            )
+        elif len(unique) == 2:
+            unique.remove(1)
+            new_length = unique.pop()
+            common_shape.append(new_length)
+            for i in range(sh_len):
+                if shapes[i][axis] == 1:
+                    if not mutable_shapes:
+                        shapes = [list(s) for s in shapes]
+                        mutable_shapes = True
+                    shapes[i][axis] = new_length
+        else:
+            common_shape.append(1)
+
+    return tuple(common_shape)
+
+
+def repeat(x, repeats, /, *, axis=None):
+    """repeat(x, repeats, axis=None)
+
+    Repeat elements of an array on a per-element basis.
+
+    Args:
+        x (usm_ndarray): input array
+
+        repeats (Union[int, Sequence[int, ...], usm_ndarray]):
+            The number of repetitions for each element.
+
+            `repeats` must be broadcast-compatible with `N` where `N` is
+            `prod(x.shape)` if `axis` is `None` and `x.shape[axis]`
+            otherwise.
+
+            If `repeats` is an array, it must have an integer data type.
+            Otherwise, `repeats` must be a Python integer or sequence of
+            Python integers (i.e., a tuple, list, or range).
+
+        axis (Optional[int]):
+            The axis along which to repeat values. If `axis` is `None`, the
+            function repeats elements of the flattened array. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            output array with repeated elements.
+
+            If `axis` is `None`, the returned array is one-dimensional,
+            otherwise, it has the same shape as `x`, except for the axis along
+            which elements were repeated.
+
+            The returned array will have the same data type as `x`.
+            The returned array will be located on the same device as `x` and
+            have the same USM allocation type as `x`.
+
+    Raises:
+        AxisError: if `axis` value is invalid.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(x)}.")
+
+    x_ndim = x.ndim
+    x_shape = x.shape
+    if axis is not None:
+        axis = normalize_axis_index(operator.index(axis), x_ndim)
+        axis_size = x_shape[axis]
+    else:
+        axis_size = x.size
+
+    scalar = False
+    if isinstance(repeats, int):
+        if repeats < 0:
+            raise ValueError("`repeats` must be a positive integer")
+        usm_type = x.usm_type
+        exec_q = x.sycl_queue
+        scalar = True
+    elif isinstance(repeats, dpt.usm_ndarray):
+        if repeats.ndim > 1:
+            raise ValueError(
+                "`repeats` array must be 0- or 1-dimensional, got "
+                f"{repeats.ndim}"
+            )
+        exec_q = dpctl.utils.get_execution_queue(
+            (x.sycl_queue, repeats.sycl_queue)
+        )
+        if exec_q is None:
+            raise dputils.ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        usm_type = dpctl.utils.get_coerced_usm_type(
+            (
+                x.usm_type,
+                repeats.usm_type,
+            )
+        )
+        dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+        if not dpt_ext.can_cast(repeats.dtype, dpt.int64, casting="same_kind"):
+            raise TypeError(
+                f"'repeats' data type {repeats.dtype} cannot be cast to "
+                "'int64' according to the casting rule ''safe.''"
+            )
+        if repeats.size == 1:
+            scalar = True
+            # bring the single element to the host
+            if repeats.ndim == 0:
+                repeats = int(repeats)
+            else:
+                # Get the single element explicitly
+                # since non-0D arrays can not be converted to scalars
+                repeats = int(repeats[0])
+            if repeats < 0:
+                raise ValueError("`repeats` elements must be positive")
+        else:
+            if repeats.size != axis_size:
+                raise ValueError(
+                    "'repeats' array must be broadcastable to the size of "
+                    "the repeated axis"
+                )
+            if not dpt.all(repeats >= 0):
+                raise ValueError("'repeats' elements must be positive")
+
+    elif isinstance(repeats, (tuple, list, range)):
+        usm_type = x.usm_type
+        exec_q = x.sycl_queue
+
+        len_reps = len(repeats)
+        if len_reps == 1:
+            repeats = repeats[0]
+            if repeats < 0:
+                raise ValueError("`repeats` elements must be positive")
+            scalar = True
+        else:
+            if len_reps != axis_size:
+                raise ValueError(
+                    "`repeats` sequence must have the same length as the "
+                    "repeated axis"
+                )
+            repeats = dpt.asarray(
+                repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q
+            )
+            if not dpt.all(repeats >= 0):
+                raise ValueError("`repeats` elements must be positive")
+    else:
+        raise TypeError(
+            "Expected int, sequence, or `usm_ndarray` for second argument,"
+            f"got {type(repeats)}"
+        )
+
+    _manager = dputils.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    if scalar:
+        res_axis_size = repeats * axis_size
+        if axis is not None:
+            res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
+        else:
+            res_shape = (res_axis_size,)
+        res = dpt.empty(
+            res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q
+        )
+        if res_axis_size > 0:
+            ht_rep_ev, rep_ev = ti._repeat_by_scalar(
+                src=x,
+                dst=res,
+                reps=repeats,
+                axis=axis,
+                sycl_queue=exec_q,
+                depends=dep_evs,
+            )
+            _manager.add_event_pair(ht_rep_ev, rep_ev)
+    else:
+        if repeats.dtype != dpt.int64:
+            rep_buf = dpt.empty(
+                repeats.shape,
+                dtype=dpt.int64,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_copy_ev, copy_ev)
+            cumsum = dpt.empty(
+                (axis_size,),
+                dtype=dpt.int64,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            # _cumsum_1d synchronizes so `depends` ends here safely
+            res_axis_size = ti._cumsum_1d(
+                rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev]
+            )
+            if axis is not None:
+                res_shape = (
+                    x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
+                )
+            else:
+                res_shape = (res_axis_size,)
+            res = dpt.empty(
+                res_shape,
+                dtype=x.dtype,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            if res_axis_size > 0:
+                ht_rep_ev, rep_ev = ti._repeat_by_sequence(
+                    src=x,
+                    dst=res,
+                    reps=rep_buf,
+                    cumsum=cumsum,
+                    axis=axis,
+                    sycl_queue=exec_q,
+                )
+                _manager.add_event_pair(ht_rep_ev, rep_ev)
+        else:
+            cumsum = dpt.empty(
+                (axis_size,),
+                dtype=dpt.int64,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            res_axis_size = ti._cumsum_1d(
+                repeats, cumsum, sycl_queue=exec_q, depends=dep_evs
+            )
+            if axis is not None:
+                res_shape = (
+                    x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
+                )
+            else:
+                res_shape = (res_axis_size,)
+            res = dpt.empty(
+                res_shape,
+                dtype=x.dtype,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            if res_axis_size > 0:
+                ht_rep_ev, rep_ev = ti._repeat_by_sequence(
+                    src=x,
+                    dst=res,
+                    reps=repeats,
+                    cumsum=cumsum,
+                    axis=axis,
+                    sycl_queue=exec_q,
+                )
+                _manager.add_event_pair(ht_rep_ev, rep_ev)
+    return res
+
+
 def roll(x, /, shift, *, axis=None):
     """
     roll(x, shift, axis)
diff --git a/dpctl_ext/tensor/_scalar_utils.py b/dpctl_ext/tensor/_scalar_utils.py
new file mode 100644
index 000000000000..86787baea8cc
--- /dev/null
+++ b/dpctl_ext/tensor/_scalar_utils.py
@@ -0,0 +1,122 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numbers
+
+import dpctl.memory as dpm
+import dpctl.tensor as dpt
+import numpy as np
+from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer
+
+from ._type_utils import (
+    WeakBooleanType,
+    WeakComplexType,
+    WeakFloatingType,
+    WeakIntegralType,
+    _to_device_supported_dtype,
+)
+
+
+def _get_queue_usm_type(o):
+    """Return SYCL device where object `o` allocated memory, or None."""
+    if isinstance(o, dpt.usm_ndarray):
+        return o.sycl_queue, o.usm_type
+    elif hasattr(o, "__sycl_usm_array_interface__"):
+        try:
+            m = dpm.as_usm_memory(o)
+            return m.sycl_queue, m.get_usm_type()
+        except Exception:
+            return None, None
+    return None, None
+
+
+def _get_dtype(o, dev):
+    if isinstance(o, dpt.usm_ndarray):
+        return o.dtype
+    if hasattr(o, "__sycl_usm_array_interface__"):
+        return dpt.asarray(o).dtype
+    if _is_buffer(o):
+        host_dt = np.array(o).dtype
+        dev_dt = _to_device_supported_dtype(host_dt, dev)
+        return dev_dt
+    if hasattr(o, "dtype"):
+        dev_dt = _to_device_supported_dtype(o.dtype, dev)
+        return dev_dt
+    if isinstance(o, bool):
+        return WeakBooleanType(o)
+    if isinstance(o, int):
+        return WeakIntegralType(o)
+    if isinstance(o, float):
+        return WeakFloatingType(o)
+    if isinstance(o, complex):
+        return WeakComplexType(o)
+    return np.object_
+
+
+def _validate_dtype(dt) -> bool:
+    return isinstance(
+        dt,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ) or (
+        isinstance(dt, dpt.dtype)
+        and dt
+        in [
+            dpt.bool,
+            dpt.int8,
+            dpt.uint8,
+            dpt.int16,
+            dpt.uint16,
+            dpt.int32,
+            dpt.uint32,
+            dpt.int64,
+            dpt.uint64,
+            dpt.float16,
+            dpt.float32,
+            dpt.float64,
+            dpt.complex64,
+            dpt.complex128,
+        ]
+    )
+
+
+def _get_shape(o):
+    if isinstance(o, dpt.usm_ndarray):
+        return o.shape
+    if _is_buffer(o):
+        return memoryview(o).shape
+    if isinstance(o, numbers.Number):
+        return ()
+    return getattr(o, "shape", tuple())
+
+
+__all__ = [
+    "_get_dtype",
+    "_get_queue_usm_type",
+    "_get_shape",
+    "_validate_dtype",
+]
diff --git a/dpctl_ext/tensor/_search_functions.py b/dpctl_ext/tensor/_search_functions.py
new file mode 100644
index 000000000000..a82845e3520c
--- /dev/null
+++ b/dpctl_ext/tensor/_search_functions.py
@@ -0,0 +1,419 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import dpctl.tensor as dpt
+from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._tensor_impl as ti
+
+from ._copy_utils import _empty_like_orderK, _empty_like_triple_orderK
+from ._manipulation_functions import _broadcast_shape_impl
+from ._scalar_utils import (
+    _get_dtype,
+    _get_queue_usm_type,
+    _get_shape,
+    _validate_dtype,
+)
+from ._type_utils import (
+    WeakBooleanType,
+    WeakComplexType,
+    WeakFloatingType,
+    WeakIntegralType,
+    _all_data_types,
+    _can_cast,
+    _is_weak_dtype,
+    _strong_dtype_num_kind,
+    _to_device_supported_dtype,
+    _weak_type_num_kind,
+)
+
+
+def _default_dtype_from_weak_type(dt, dev):
+    if isinstance(dt, WeakBooleanType):
+        return dpt.bool
+    if isinstance(dt, WeakIntegralType):
+        return dpt.dtype(ti.default_device_int_type(dev))
+    if isinstance(dt, WeakFloatingType):
+        return dpt.dtype(ti.default_device_fp_type(dev))
+    if isinstance(dt, WeakComplexType):
+        return dpt.dtype(ti.default_device_complex_type(dev))
+
+
+def _resolve_two_weak_types(o1_dtype, o2_dtype, dev):
+    """Resolves two weak data types per NEP-0050"""
+    if _is_weak_dtype(o1_dtype):
+        if _is_weak_dtype(o2_dtype):
+            return _default_dtype_from_weak_type(
+                o1_dtype, dev
+            ), _default_dtype_from_weak_type(o2_dtype, dev)
+        o1_kind_num = _weak_type_num_kind(o1_dtype)
+        o2_kind_num = _strong_dtype_num_kind(o2_dtype)
+        if o1_kind_num > o2_kind_num:
+            if isinstance(o1_dtype, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype
+            if isinstance(o1_dtype, WeakComplexType):
+                if o2_dtype is dpt.float16 or o2_dtype is dpt.float32:
+                    return dpt.complex64, o2_dtype
+                return (
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                    o2_dtype,
+                )
+            return _to_device_supported_dtype(dpt.float64, dev), o2_dtype
+        else:
+            return o2_dtype, o2_dtype
+    elif _is_weak_dtype(o2_dtype):
+        o1_kind_num = _strong_dtype_num_kind(o1_dtype)
+        o2_kind_num = _weak_type_num_kind(o2_dtype)
+        if o2_kind_num > o1_kind_num:
+            if isinstance(o2_dtype, WeakIntegralType):
+                return o1_dtype, dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(o2_dtype, WeakComplexType):
+                if o1_dtype is dpt.float16 or o1_dtype is dpt.float32:
+                    return o1_dtype, dpt.complex64
+                return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev)
+            return (
+                o1_dtype,
+                _to_device_supported_dtype(dpt.float64, dev),
+            )
+        else:
+            return o1_dtype, o1_dtype
+    else:
+        return o1_dtype, o2_dtype
+
+
+def _where_result_type(dt1, dt2, dev):
+    res_dtype = dpt_ext.result_type(dt1, dt2)
+    fp16 = dev.has_aspect_fp16
+    fp64 = dev.has_aspect_fp64
+
+    all_dts = _all_data_types(fp16, fp64)
+    if res_dtype in all_dts:
+        return res_dtype
+    else:
+        for res_dtype_ in all_dts:
+            if _can_cast(dt1, res_dtype_, fp16, fp64) and _can_cast(
+                dt2, res_dtype_, fp16, fp64
+            ):
+                return res_dtype_
+        return None
+
+
+def where(condition, x1, x2, /, *, order="K", out=None):
+    """
+    Returns :class:`dpctl.tensor.usm_ndarray` with elements chosen
+    from ``x1`` or ``x2`` depending on ``condition``.
+
+    Args:
+        condition (usm_ndarray): When ``True`` yields from ``x1``,
+            and otherwise yields from ``x2``.
+            Must be compatible with ``x1`` and ``x2`` according
+            to broadcasting rules.
+        x1 (Union[usm_ndarray, bool, int, float, complex]):
+            Array from which values are chosen when ``condition`` is ``True``.
+            Must be compatible with ``condition`` and ``x2`` according
+            to broadcasting rules.
+        x2 (Union[usm_ndarray, bool, int, float, complex]):
+            Array from which values are chosen when ``condition`` is not
+            ``True``.
+            Must be compatible with ``condition`` and ``x2`` according
+            to broadcasting rules.
+        order (``"K"``, ``"C"``, ``"F"``, ``"A"``, optional):
+            Memory layout of the new output array,
+            if parameter ``out`` is ``None``.
+            Default: ``"K"``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of `out` must match the expected shape and the
+            expected data type of the result.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            An array with elements from ``x1`` where ``condition`` is ``True``,
+            and elements from ``x2`` elsewhere.
+
+    The data type of the returned array is determined by applying
+    the Type Promotion Rules to ``x1`` and ``x2``.
+    """
+    if not isinstance(condition, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}"
+        )
+    if order not in ["K", "C", "F", "A"]:
+        order = "K"
+    q1, condition_usm_type = condition.sycl_queue, condition.usm_type
+    q2, x1_usm_type = _get_queue_usm_type(x1)
+    q3, x2_usm_type = _get_queue_usm_type(x2)
+    if q2 is None and q3 is None:
+        exec_q = q1
+        out_usm_type = condition_usm_type
+    elif q3 is None:
+        exec_q = dpctl.utils.get_execution_queue((q1, q2))
+        if exec_q is None:
+            raise ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        out_usm_type = dpctl.utils.get_coerced_usm_type(
+            (
+                condition_usm_type,
+                x1_usm_type,
+            )
+        )
+    elif q2 is None:
+        exec_q = dpctl.utils.get_execution_queue((q1, q3))
+        if exec_q is None:
+            raise ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        out_usm_type = dpctl.utils.get_coerced_usm_type(
+            (
+                condition_usm_type,
+                x2_usm_type,
+            )
+        )
+    else:
+        exec_q = dpctl.utils.get_execution_queue((q1, q2, q3))
+        if exec_q is None:
+            raise ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        out_usm_type = dpctl.utils.get_coerced_usm_type(
+            (
+                condition_usm_type,
+                x1_usm_type,
+                x2_usm_type,
+            )
+        )
+    dpctl.utils.validate_usm_type(out_usm_type, allow_none=False)
+    condition_shape = condition.shape
+    x1_shape = _get_shape(x1)
+    x2_shape = _get_shape(x2)
+    if not all(
+        isinstance(s, (tuple, list))
+        for s in (
+            x1_shape,
+            x2_shape,
+        )
+    ):
+        raise TypeError(
+            "Shape of arguments can not be inferred. "
+            "Arguments are expected to be "
+            "lists, tuples, or both"
+        )
+    try:
+        res_shape = _broadcast_shape_impl(
+            [
+                condition_shape,
+                x1_shape,
+                x2_shape,
+            ]
+        )
+    except ValueError:
+        raise ValueError(
+            "operands could not be broadcast together with shapes "
+            f"{condition_shape}, {x1_shape}, and {x2_shape}"
+        )
+    sycl_dev = exec_q.sycl_device
+    x1_dtype = _get_dtype(x1, sycl_dev)
+    x2_dtype = _get_dtype(x2, sycl_dev)
+    if not all(_validate_dtype(o) for o in (x1_dtype, x2_dtype)):
+        raise ValueError("Operands have unsupported data types")
+    x1_dtype, x2_dtype = _resolve_two_weak_types(x1_dtype, x2_dtype, sycl_dev)
+    out_dtype = _where_result_type(x1_dtype, x2_dtype, sycl_dev)
+    if out_dtype is None:
+        raise TypeError(
+            "function 'where' does not support input "
+            f"types ({x1_dtype}, {x2_dtype}), "
+            "and the inputs could not be safely coerced "
+            "to any supported types according to the casting rule ''safe''."
+        )
+
+    orig_out = out
+    if out is not None:
+        if not isinstance(out, dpt.usm_ndarray):
+            raise TypeError(
+                "output array must be of usm_ndarray type, got " f"{type(out)}"
+            )
+
+        if not out.flags.writable:
+            raise ValueError("provided `out` array is read-only")
+
+        if out.shape != res_shape:
+            raise ValueError(
+                "The shape of input and output arrays are "
+                f"inconsistent. Expected output shape is {res_shape}, "
+                f"got {out.shape}"
+            )
+
+        if out_dtype != out.dtype:
+            raise ValueError(
+                f"Output array of type {out_dtype} is needed, "
+                f"got {out.dtype}"
+            )
+
+        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise ExecutionPlacementError(
+                "Input and output allocation queues are not compatible"
+            )
+
+        if ti._array_overlap(condition, out) and not ti._same_logical_tensors(
+            condition, out
+        ):
+            out = dpt.empty_like(out)
+
+        if isinstance(x1, dpt.usm_ndarray):
+            if (
+                ti._array_overlap(x1, out)
+                and not ti._same_logical_tensors(x1, out)
+                and x1_dtype == out_dtype
+            ):
+                out = dpt.empty_like(out)
+
+        if isinstance(x2, dpt.usm_ndarray):
+            if (
+                ti._array_overlap(x2, out)
+                and not ti._same_logical_tensors(x2, out)
+                and x2_dtype == out_dtype
+            ):
+                out = dpt.empty_like(out)
+
+    if order == "A":
+        order = (
+            "F"
+            if all(
+                arr.flags.f_contiguous
+                for arr in (
+                    condition,
+                    x1,
+                    x2,
+                )
+            )
+            else "C"
+        )
+    if not isinstance(x1, dpt.usm_ndarray):
+        x1 = dpt.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q)
+    if not isinstance(x2, dpt.usm_ndarray):
+        x2 = dpt.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q)
+
+    if condition.size == 0:
+        if out is not None:
+            return out
+        else:
+            if order == "K":
+                return _empty_like_triple_orderK(
+                    condition,
+                    x1,
+                    x2,
+                    out_dtype,
+                    res_shape,
+                    out_usm_type,
+                    exec_q,
+                )
+            else:
+                return dpt.empty(
+                    res_shape,
+                    dtype=out_dtype,
+                    order=order,
+                    usm_type=out_usm_type,
+                    sycl_queue=exec_q,
+                )
+
+    _manager = SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    if x1_dtype != out_dtype:
+        if order == "K":
+            _x1 = _empty_like_orderK(x1, out_dtype)
+        else:
+            _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order)
+        ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs
+        )
+        x1 = _x1
+        _manager.add_event_pair(ht_copy1_ev, copy1_ev)
+
+    if x2_dtype != out_dtype:
+        if order == "K":
+            _x2 = _empty_like_orderK(x2, out_dtype)
+        else:
+            _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order)
+        ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs
+        )
+        x2 = _x2
+        _manager.add_event_pair(ht_copy2_ev, copy2_ev)
+
+    if out is None:
+        if order == "K":
+            out = _empty_like_triple_orderK(
+                condition, x1, x2, out_dtype, res_shape, out_usm_type, exec_q
+            )
+        else:
+            out = dpt.empty(
+                res_shape,
+                dtype=out_dtype,
+                order=order,
+                usm_type=out_usm_type,
+                sycl_queue=exec_q,
+            )
+
+    if condition_shape != res_shape:
+        condition = dpt.broadcast_to(condition, res_shape)
+    if x1_shape != res_shape:
+        x1 = dpt.broadcast_to(x1, res_shape)
+    if x2_shape != res_shape:
+        x2 = dpt.broadcast_to(x2, res_shape)
+
+    dep_evs = _manager.submitted_events
+    hev, where_ev = ti._where(
+        condition=condition,
+        x1=x1,
+        x2=x2,
+        dst=out,
+        sycl_queue=exec_q,
+        depends=dep_evs,
+    )
+    _manager.add_event_pair(hev, where_ev)
+    if not (orig_out is None or orig_out is out):
+        # Copy the out data from temporary buffer to original memory
+        ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=out,
+            dst=orig_out,
+            sycl_queue=exec_q,
+            depends=[where_ev],
+        )
+        _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+        out = orig_out
+
+    return out
diff --git a/dpctl_ext/tensor/_type_utils.py b/dpctl_ext/tensor/_type_utils.py
new file mode 100644
index 000000000000..1e386e15dfa3
--- /dev/null
+++ b/dpctl_ext/tensor/_type_utils.py
@@ -0,0 +1,999 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from __future__ import annotations
+
+import dpctl.tensor as dpt
+import numpy as np
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._tensor_impl as ti
+
+
+def _all_data_types(_fp16, _fp64):
+    _non_fp_types = [
+        dpt.bool,
+        dpt.int8,
+        dpt.uint8,
+        dpt.int16,
+        dpt.uint16,
+        dpt.int32,
+        dpt.uint32,
+        dpt.int64,
+        dpt.uint64,
+    ]
+    if _fp64:
+        if _fp16:
+            return _non_fp_types + [
+                dpt.float16,
+                dpt.float32,
+                dpt.float64,
+                dpt.complex64,
+                dpt.complex128,
+            ]
+        else:
+            return _non_fp_types + [
+                dpt.float32,
+                dpt.float64,
+                dpt.complex64,
+                dpt.complex128,
+            ]
+    else:
+        if _fp16:
+            return _non_fp_types + [
+                dpt.float16,
+                dpt.float32,
+                dpt.complex64,
+            ]
+        else:
+            return _non_fp_types + [
+                dpt.float32,
+                dpt.complex64,
+            ]
+
+
+def _acceptance_fn_default_binary(
+    arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev
+):
+    return True
+
+
+def _acceptance_fn_default_unary(arg_dtype, ret_buf_dt, res_dt, sycl_dev):
+    return True
+
+
+def _acceptance_fn_divide(
+    arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev
+):
+    # both are being promoted, if the kind of result is
+    # different than the kind of original input dtypes,
+    # we use default dtype for the resulting kind.
+    # This covers, e.g. (array_dtype_i1 / array_dtype_u1)
+    # result of which in divide is double (in NumPy), but
+    # regular type promotion rules peg at float16
+    if (ret_buf1_dt.kind != arg1_dtype.kind) and (
+        ret_buf2_dt.kind != arg2_dtype.kind
+    ):
+        default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev)
+        if res_dt == default_dt:
+            return True
+        else:
+            return False
+    else:
+        return True
+
+
+def _acceptance_fn_negative(arg_dtype, buf_dt, res_dt, sycl_dev):
+    # negative is not defined for boolean data type
+    if arg_dtype.char == "?":
+        raise ValueError(
+            "The `negative` function, the `-` operator, is not supported "
+            "for inputs of data type bool, use the `~` operator or the "
+            "`logical_not` function instead"
+        )
+    else:
+        return True
+
+
+def _acceptance_fn_reciprocal(arg_dtype, buf_dt, res_dt, sycl_dev):
+    # if the kind of result is different from the kind of input, we use the
+    # default floating-point dtype for the resulting kind. This guarantees
+    # alignment of reciprocal and divide output types.
+    if buf_dt.kind != arg_dtype.kind:
+        default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev)
+        if res_dt == default_dt:
+            return True
+        else:
+            return False
+    else:
+        return True
+
+
+def _acceptance_fn_subtract(
+    arg1_dtype, arg2_dtype, buf1_dt, buf2_dt, res_dt, sycl_dev
+):
+    # subtract is not defined for boolean data type
+    if arg1_dtype.char == "?" and arg2_dtype.char == "?":
+        raise ValueError(
+            "The `subtract` function, the `-` operator, is not supported "
+            "for inputs of data type bool, use the `^` operator,  the "
+            "`bitwise_xor`, or the `logical_xor` function instead"
+        )
+    else:
+        return True
+
+
+def _can_cast(
+    from_: dpt.dtype, to_: dpt.dtype, _fp16: bool, _fp64: bool, casting="safe"
+) -> bool:
+    """
+    Can `from_` be cast to `to_` safely on a device with
+    fp16 and fp64 aspects as given?
+    """
+    if not _dtype_supported_by_device_impl(to_, _fp16, _fp64):
+        return False
+    can_cast_v = np.can_cast(from_, to_, casting=casting)  # ask NumPy
+    if _fp16 and _fp64:
+        return can_cast_v
+    if not can_cast_v:
+        if (
+            from_.kind in "biu"
+            and to_.kind in "fc"
+            and _is_maximal_inexact_type(to_, _fp16, _fp64)
+        ):
+            return True
+
+    return can_cast_v
+
+
+def _dtype_supported_by_device_impl(
+    dt: dpt.dtype, has_fp16: bool, has_fp64: bool
+) -> bool:
+    if has_fp64:
+        if not has_fp16:
+            if dt is dpt.float16:
+                return False
+    else:
+        if dt is dpt.float64:
+            return False
+        elif dt is dpt.complex128:
+            return False
+        if not has_fp16 and dt is dpt.float16:
+            return False
+    return True
+
+
+def _find_buf_dtype(arg_dtype, query_fn, sycl_dev, acceptance_fn):
+    res_dt = query_fn(arg_dtype)
+    if res_dt:
+        return None, res_dt
+
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    all_dts = _all_data_types(_fp16, _fp64)
+    for buf_dt in all_dts:
+        if _can_cast(arg_dtype, buf_dt, _fp16, _fp64):
+            res_dt = query_fn(buf_dt)
+            if res_dt:
+                acceptable = acceptance_fn(arg_dtype, buf_dt, res_dt, sycl_dev)
+                if acceptable:
+                    return buf_dt, res_dt
+                else:
+                    continue
+
+    return None, None
+
+
+def _find_buf_dtype2(arg1_dtype, arg2_dtype, query_fn, sycl_dev, acceptance_fn):
+    res_dt = query_fn(arg1_dtype, arg2_dtype)
+    if res_dt:
+        return None, None, res_dt
+
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    all_dts = _all_data_types(_fp16, _fp64)
+    for buf1_dt in all_dts:
+        for buf2_dt in all_dts:
+            if _can_cast(arg1_dtype, buf1_dt, _fp16, _fp64) and _can_cast(
+                arg2_dtype, buf2_dt, _fp16, _fp64
+            ):
+                res_dt = query_fn(buf1_dt, buf2_dt)
+                if res_dt:
+                    ret_buf1_dt = None if buf1_dt == arg1_dtype else buf1_dt
+                    ret_buf2_dt = None if buf2_dt == arg2_dtype else buf2_dt
+                    if ret_buf1_dt is None or ret_buf2_dt is None:
+                        return ret_buf1_dt, ret_buf2_dt, res_dt
+                    else:
+                        acceptable = acceptance_fn(
+                            arg1_dtype,
+                            arg2_dtype,
+                            ret_buf1_dt,
+                            ret_buf2_dt,
+                            res_dt,
+                            sycl_dev,
+                        )
+                        if acceptable:
+                            return ret_buf1_dt, ret_buf2_dt, res_dt
+                        else:
+                            continue
+
+    return None, None, None
+
+
+def _find_buf_dtype_in_place_op(arg1_dtype, arg2_dtype, query_fn, sycl_dev):
+    res_dt = query_fn(arg1_dtype, arg2_dtype)
+    if res_dt:
+        return None, res_dt
+
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if _can_cast(arg2_dtype, arg1_dtype, _fp16, _fp64, casting="same_kind"):
+        res_dt = query_fn(arg1_dtype, arg1_dtype)
+        if res_dt:
+            return arg1_dtype, res_dt
+
+    return None, None
+
+
+def _get_device_default_dtype(dt_kind, sycl_dev):
+    if dt_kind == "b":
+        return dpt.dtype(ti.default_device_bool_type(sycl_dev))
+    elif dt_kind == "i":
+        return dpt.dtype(ti.default_device_int_type(sycl_dev))
+    elif dt_kind == "u":
+        return dpt.dtype(ti.default_device_uint_type(sycl_dev))
+    elif dt_kind == "f":
+        return dpt.dtype(ti.default_device_fp_type(sycl_dev))
+    elif dt_kind == "c":
+        return dpt.dtype(ti.default_device_complex_type(sycl_dev))
+    raise RuntimeError
+
+
+def _is_maximal_inexact_type(dt: dpt.dtype, _fp16: bool, _fp64: bool):
+    """
+    Return True if data type `dt` is the
+    maximal size inexact data type
+    """
+    if _fp64:
+        return dt in [dpt.float64, dpt.complex128]
+    return dt in [dpt.float32, dpt.complex64]
+
+
+def _to_device_supported_dtype(dt, dev):
+    has_fp16 = dev.has_aspect_fp16
+    has_fp64 = dev.has_aspect_fp64
+
+    return _to_device_supported_dtype_impl(dt, has_fp16, has_fp64)
+
+
+def _to_device_supported_dtype_impl(dt, has_fp16, has_fp64):
+    if has_fp64:
+        if not has_fp16:
+            if dt is dpt.float16:
+                return dpt.float32
+    else:
+        if dt is dpt.float64:
+            return dpt.float32
+        elif dt is dpt.complex128:
+            return dpt.complex64
+        if not has_fp16 and dt is dpt.float16:
+            return dpt.float32
+    return dt
+
+
+class WeakBooleanType:
+    """Python type representing type of Python boolean objects"""
+
+    def __init__(self, o):
+        self.o_ = o
+
+    def get(self):
+        return self.o_
+
+
+class WeakIntegralType:
+    """Python type representing type of Python integral objects"""
+
+    def __init__(self, o):
+        self.o_ = o
+
+    def get(self):
+        return self.o_
+
+
+class WeakFloatingType:
+    """Python type representing type of Python floating point objects"""
+
+    def __init__(self, o):
+        self.o_ = o
+
+    def get(self):
+        return self.o_
+
+
+class WeakComplexType:
+    """Python type representing type of Python complex floating point objects"""
+
+    def __init__(self, o):
+        self.o_ = o
+
+    def get(self):
+        return self.o_
+
+
+def _weak_type_num_kind(o):
+    _map = {"?": 0, "i": 1, "f": 2, "c": 3}
+    if isinstance(o, WeakBooleanType):
+        return _map["?"]
+    if isinstance(o, WeakIntegralType):
+        return _map["i"]
+    if isinstance(o, WeakFloatingType):
+        return _map["f"]
+    if isinstance(o, WeakComplexType):
+        return _map["c"]
+    raise TypeError(
+        f"Unexpected type {o} while expecting "
+        "`WeakBooleanType`, `WeakIntegralType`,"
+        "`WeakFloatingType`, or `WeakComplexType`."
+    )
+
+
+def _strong_dtype_num_kind(o):
+    _map = {"b": 0, "i": 1, "u": 1, "f": 2, "c": 3}
+    if not isinstance(o, dpt.dtype):
+        raise TypeError
+    k = o.kind
+    if k in _map:
+        return _map[k]
+    raise ValueError(f"Unrecognized kind {k} for dtype {o}")
+
+
+def _is_weak_dtype(dtype):
+    return isinstance(
+        dtype,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    )
+
+
+def _resolve_weak_types(o1_dtype, o2_dtype, dev):
+    """Resolves weak data type per NEP-0050"""
+    if _is_weak_dtype(o1_dtype):
+        if _is_weak_dtype(o2_dtype):
+            raise ValueError
+        o1_kind_num = _weak_type_num_kind(o1_dtype)
+        o2_kind_num = _strong_dtype_num_kind(o2_dtype)
+        if o1_kind_num > o2_kind_num:
+            if isinstance(o1_dtype, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype
+            if isinstance(o1_dtype, WeakComplexType):
+                if o2_dtype is dpt.float16 or o2_dtype is dpt.float32:
+                    return dpt.complex64, o2_dtype
+                return (
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                    o2_dtype,
+                )
+            return _to_device_supported_dtype(dpt.float64, dev), o2_dtype
+        else:
+            return o2_dtype, o2_dtype
+    elif _is_weak_dtype(o2_dtype):
+        o1_kind_num = _strong_dtype_num_kind(o1_dtype)
+        o2_kind_num = _weak_type_num_kind(o2_dtype)
+        if o2_kind_num > o1_kind_num:
+            if isinstance(o2_dtype, WeakIntegralType):
+                return o1_dtype, dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(o2_dtype, WeakComplexType):
+                if o1_dtype is dpt.float16 or o1_dtype is dpt.float32:
+                    return o1_dtype, dpt.complex64
+                return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev)
+            return (
+                o1_dtype,
+                _to_device_supported_dtype(dpt.float64, dev),
+            )
+        else:
+            return o1_dtype, o1_dtype
+    else:
+        return o1_dtype, o2_dtype
+
+
+def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev):
+    """
+    Resolves weak data type per NEP-0050 for comparisons and
+    divide, where result type is known and special behavior
+    is needed to handle mixed integer kinds and Python integers
+    without overflow
+    """
+    if _is_weak_dtype(o1_dtype):
+        if _is_weak_dtype(o2_dtype):
+            raise ValueError
+        o1_kind_num = _weak_type_num_kind(o1_dtype)
+        o2_kind_num = _strong_dtype_num_kind(o2_dtype)
+        if o1_kind_num > o2_kind_num:
+            if isinstance(o1_dtype, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype
+            if isinstance(o1_dtype, WeakComplexType):
+                if o2_dtype is dpt.float16 or o2_dtype is dpt.float32:
+                    return dpt.complex64, o2_dtype
+                return (
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                    o2_dtype,
+                )
+            return _to_device_supported_dtype(dpt.float64, dev), o2_dtype
+        else:
+            if o1_kind_num == o2_kind_num and isinstance(
+                o1_dtype, WeakIntegralType
+            ):
+                o1_val = o1_dtype.get()
+                o2_iinfo = dpt_ext.iinfo(o2_dtype)
+                if (o1_val < o2_iinfo.min) or (o1_val > o2_iinfo.max):
+                    return dpt.dtype(np.min_scalar_type(o1_val)), o2_dtype
+            return o2_dtype, o2_dtype
+    elif _is_weak_dtype(o2_dtype):
+        o1_kind_num = _strong_dtype_num_kind(o1_dtype)
+        o2_kind_num = _weak_type_num_kind(o2_dtype)
+        if o2_kind_num > o1_kind_num:
+            if isinstance(o2_dtype, WeakIntegralType):
+                return o1_dtype, dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(o2_dtype, WeakComplexType):
+                if o1_dtype is dpt.float16 or o1_dtype is dpt.float32:
+                    return o1_dtype, dpt.complex64
+                return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev)
+            return (
+                o1_dtype,
+                _to_device_supported_dtype(dpt.float64, dev),
+            )
+        else:
+            if o1_kind_num == o2_kind_num and isinstance(
+                o2_dtype, WeakIntegralType
+            ):
+                o2_val = o2_dtype.get()
+                o1_iinfo = dpt_ext.iinfo(o1_dtype)
+                if (o2_val < o1_iinfo.min) or (o2_val > o1_iinfo.max):
+                    return o1_dtype, dpt.dtype(np.min_scalar_type(o2_val))
+            return o1_dtype, o1_dtype
+    else:
+        return o1_dtype, o2_dtype
+
+
+def _resolve_one_strong_two_weak_types(st_dtype, dtype1, dtype2, dev):
+    """
+    Resolves weak data types per NEP-0050,
+    where the second and third arguments are
+    permitted to be weak types
+    """
+    if _is_weak_dtype(st_dtype):
+        raise ValueError
+    if _is_weak_dtype(dtype1):
+        if _is_weak_dtype(dtype2):
+            kind_num1 = _weak_type_num_kind(dtype1)
+            kind_num2 = _weak_type_num_kind(dtype2)
+            st_kind_num = _strong_dtype_num_kind(st_dtype)
+
+            if kind_num1 > st_kind_num:
+                if isinstance(dtype1, WeakIntegralType):
+                    ret_dtype1 = dpt.dtype(ti.default_device_int_type(dev))
+                elif isinstance(dtype1, WeakComplexType):
+                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                        ret_dtype1 = dpt.complex64
+                    ret_dtype1 = _to_device_supported_dtype(dpt.complex128, dev)
+                else:
+                    ret_dtype1 = _to_device_supported_dtype(dpt.float64, dev)
+            else:
+                ret_dtype1 = st_dtype
+
+            if kind_num2 > st_kind_num:
+                if isinstance(dtype2, WeakIntegralType):
+                    ret_dtype2 = dpt.dtype(ti.default_device_int_type(dev))
+                elif isinstance(dtype2, WeakComplexType):
+                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                        ret_dtype2 = dpt.complex64
+                    ret_dtype2 = _to_device_supported_dtype(dpt.complex128, dev)
+                else:
+                    ret_dtype2 = _to_device_supported_dtype(dpt.float64, dev)
+            else:
+                ret_dtype2 = st_dtype
+
+            return ret_dtype1, ret_dtype2
+
+        max_dt_num_kind, max_dtype = max(
+            [
+                (_strong_dtype_num_kind(st_dtype), st_dtype),
+                (_strong_dtype_num_kind(dtype2), dtype2),
+            ]
+        )
+        dt1_kind_num = _weak_type_num_kind(dtype1)
+        if dt1_kind_num > max_dt_num_kind:
+            if isinstance(dtype1, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev)), dtype2
+            if isinstance(dtype1, WeakComplexType):
+                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
+                    return dpt.complex64, dtype2
+                return (
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                    dtype2,
+                )
+            return _to_device_supported_dtype(dpt.float64, dev), dtype2
+        else:
+            return max_dtype, dtype2
+    elif _is_weak_dtype(dtype2):
+        max_dt_num_kind, max_dtype = max(
+            [
+                (_strong_dtype_num_kind(st_dtype), st_dtype),
+                (_strong_dtype_num_kind(dtype1), dtype1),
+            ]
+        )
+        dt2_kind_num = _weak_type_num_kind(dtype2)
+        if dt2_kind_num > max_dt_num_kind:
+            if isinstance(dtype2, WeakIntegralType):
+                return dtype1, dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(dtype2, WeakComplexType):
+                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
+                    return dtype1, dpt.complex64
+                return (
+                    dtype1,
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                )
+            return dtype1, _to_device_supported_dtype(dpt.float64, dev)
+        else:
+            return dtype1, max_dtype
+    else:
+        # both are strong dtypes
+        # return unmodified
+        return dtype1, dtype2
+
+
+def _resolve_one_strong_one_weak_types(st_dtype, dtype, dev):
+    """Resolves one weak data type with one strong data type per NEP-0050"""
+    if _is_weak_dtype(st_dtype):
+        raise ValueError
+    if _is_weak_dtype(dtype):
+        st_kind_num = _strong_dtype_num_kind(st_dtype)
+        kind_num = _weak_type_num_kind(dtype)
+        if kind_num > st_kind_num:
+            if isinstance(dtype, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(dtype, WeakComplexType):
+                if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                    return dpt.complex64
+                return _to_device_supported_dtype(dpt.complex128, dev)
+            return _to_device_supported_dtype(dpt.float64, dev)
+        else:
+            return st_dtype
+    else:
+        return dtype
+
+
+class finfo_object:
+    """
+    `numpy.finfo` subclass which returns Python floating-point scalars for
+    `eps`, `max`, `min`, and `smallest_normal` attributes.
+    """
+
+    def __init__(self, dtype):
+        _supported_dtype([dpt.dtype(dtype)])
+        self._finfo = np.finfo(dtype)
+
+    @property
+    def bits(self):
+        """Number of bits occupied by the real-valued floating-point data type."""
+        return int(self._finfo.bits)
+
+    @property
+    def smallest_normal(self):
+        """
+        Smallest positive real-valued floating-point number with full
+        precision.
+        """
+        return float(self._finfo.smallest_normal)
+
+    @property
+    def tiny(self):
+        """An alias for `smallest_normal`"""
+        return float(self._finfo.tiny)
+
+    @property
+    def eps(self):
+        """
+        Difference between 1.0 and the next smallest representable real-valued
+        floating-point number larger than 1.0 according to the IEEE-754
+        standard.
+        """
+        return float(self._finfo.eps)
+
+    @property
+    def epsneg(self):
+        """
+        Difference between 1.0 and the next smallest representable real-valued
+        floating-point number smaller than 1.0 according to the IEEE-754
+        standard.
+        """
+        return float(self._finfo.epsneg)
+
+    @property
+    def min(self):
+        """Smallest representable real-valued number."""
+        return float(self._finfo.min)
+
+    @property
+    def max(self):
+        """Largest representable real-valued number."""
+        return float(self._finfo.max)
+
+    @property
+    def resolution(self):
+        """The approximate decimal resolution of this type."""
+        return float(self._finfo.resolution)
+
+    @property
+    def precision(self):
+        """
+        The approximate number of decimal digits to which this kind of
+        floating point type is precise.
+        """
+        return float(self._finfo.precision)
+
+    @property
+    def dtype(self):
+        """
+        The dtype for which finfo returns information. For complex input, the
+        returned dtype is the associated floating point dtype for its real and
+        complex components.
+        """
+        return self._finfo.dtype
+
+    def __str__(self):
+        return self._finfo.__str__()
+
+    def __repr__(self):
+        return self._finfo.__repr__()
+
+
+def can_cast(from_, to, /, *, casting="safe") -> bool:
+    """can_cast(from, to, casting="safe")
+
+    Determines if one data type can be cast to another data type according \
+    to Type Promotion Rules.
+
+    Args:
+       from_ (Union[usm_ndarray, dtype]):
+           source data type. If `from_` is an array, a device-specific type
+           promotion rules apply.
+       to (dtype):
+           target data type
+       casting (Optional[str]):
+            controls what kind of data casting may occur.
+
+                * "no" means data types should not be cast at all.
+                * "safe" means only casts that preserve values are allowed.
+                * "same_kind" means only safe casts and casts within a kind,
+                  like `float64` to `float32`, are allowed.
+                * "unsafe" means any data conversion can be done.
+
+            Default: `"safe"`.
+
+    Returns:
+        bool:
+            Gives `True` if cast can occur according to the casting rule.
+
+    Device-specific type promotion rules take into account which data type are
+    and are not supported by a specific device.
+    """
+    if isinstance(to, dpt.usm_ndarray):
+        raise TypeError(f"Expected `dpt.dtype` type, got {type(to)}.")
+
+    dtype_to = dpt.dtype(to)
+    _supported_dtype([dtype_to])
+
+    if isinstance(from_, dpt.usm_ndarray):
+        dtype_from = from_.dtype
+        return _can_cast(
+            dtype_from,
+            dtype_to,
+            from_.sycl_device.has_aspect_fp16,
+            from_.sycl_device.has_aspect_fp64,
+            casting=casting,
+        )
+    else:
+        dtype_from = dpt.dtype(from_)
+        _supported_dtype([dtype_from])
+        # query casting as if all dtypes are supported
+        return _can_cast(dtype_from, dtype_to, True, True, casting=casting)
+
+
+def result_type(*arrays_and_dtypes):
+    """
+    result_type(*arrays_and_dtypes)
+
+    Returns the dtype that results from applying the Type Promotion Rules to \
+        the arguments.
+
+    Args:
+        arrays_and_dtypes (Union[usm_ndarray, dtype]):
+            An arbitrary length sequence of usm_ndarray objects or dtypes.
+
+    Returns:
+        dtype:
+            The dtype resulting from an operation involving the
+            input arrays and dtypes.
+    """
+    dtypes = []
+    devices = []
+    weak_dtypes = []
+    for arg_i in arrays_and_dtypes:
+        if isinstance(arg_i, dpt.usm_ndarray):
+            devices.append(arg_i.sycl_device)
+            dtypes.append(arg_i.dtype)
+        elif isinstance(arg_i, int):
+            weak_dtypes.append(WeakIntegralType(arg_i))
+        elif isinstance(arg_i, float):
+            weak_dtypes.append(WeakFloatingType(arg_i))
+        elif isinstance(arg_i, complex):
+            weak_dtypes.append(WeakComplexType(arg_i))
+        elif isinstance(arg_i, bool):
+            weak_dtypes.append(WeakBooleanType(arg_i))
+        else:
+            dt = dpt.dtype(arg_i)
+            _supported_dtype([dt])
+            dtypes.append(dt)
+
+    has_fp16 = True
+    has_fp64 = True
+    target_dev = None
+    if devices:
+        inspected = False
+        for d in devices:
+            if inspected:
+                unsame_fp16_support = d.has_aspect_fp16 != has_fp16
+                unsame_fp64_support = d.has_aspect_fp64 != has_fp64
+                if unsame_fp16_support or unsame_fp64_support:
+                    raise ValueError(
+                        "Input arrays reside on devices "
+                        "with different device supports; "
+                        "unable to determine which "
+                        "device-specific type promotion rules "
+                        "to use."
+                    )
+            else:
+                has_fp16 = d.has_aspect_fp16
+                has_fp64 = d.has_aspect_fp64
+                target_dev = d
+                inspected = True
+
+    if not dtypes and weak_dtypes:
+        dtypes.append(weak_dtypes[0].get())
+
+    if not (has_fp16 and has_fp64):
+        for dt in dtypes:
+            if not _dtype_supported_by_device_impl(dt, has_fp16, has_fp64):
+                raise ValueError(
+                    f"Argument {dt} is not supported by the device"
+                )
+        res_dt = np.result_type(*dtypes)
+        res_dt = _to_device_supported_dtype_impl(res_dt, has_fp16, has_fp64)
+        for wdt in weak_dtypes:
+            pair = _resolve_weak_types(wdt, res_dt, target_dev)
+            res_dt = np.result_type(*pair)
+            res_dt = _to_device_supported_dtype_impl(res_dt, has_fp16, has_fp64)
+    else:
+        res_dt = np.result_type(*dtypes)
+        if weak_dtypes:
+            weak_dt_obj = [wdt.get() for wdt in weak_dtypes]
+            res_dt = np.result_type(res_dt, *weak_dt_obj)
+
+    return res_dt
+
+
+def iinfo(dtype, /):
+    """iinfo(dtype)
+
+    Returns machine limits for integer data types.
+
+    Args:
+        dtype (dtype, usm_ndarray):
+            integer dtype or
+            an array with integer dtype.
+
+    Returns:
+        iinfo_object:
+            An object with the following attributes:
+
+            * bits: int
+                number of bits occupied by the data type
+            * max: int
+                largest representable number.
+            * min: int
+                smallest representable number.
+            * dtype: dtype
+                integer data type.
+    """
+    if isinstance(dtype, dpt.usm_ndarray):
+        dtype = dtype.dtype
+    _supported_dtype([dpt.dtype(dtype)])
+    return np.iinfo(dtype)
+
+
+def finfo(dtype, /):
+    """finfo(type)
+
+    Returns machine limits for floating-point data types.
+
+    Args:
+        dtype (dtype, usm_ndarray): floating-point dtype or
+            an array with floating point data type.
+            If complex, the information is about its component
+            data type.
+
+    Returns:
+        finfo_object:
+            an object have the following attributes:
+
+                * bits: int
+                    number of bits occupied by dtype.
+                * eps: float
+                    difference between 1.0 and the next smallest representable
+                    real-valued floating-point number larger than 1.0 according
+                    to the IEEE-754 standard.
+                * max: float
+                    largest representable real-valued number.
+                * min: float
+                    smallest representable real-valued number.
+                * smallest_normal: float
+                    smallest positive real-valued floating-point number with
+                    full precision.
+                * dtype: dtype
+                    real-valued floating-point data type.
+
+    """
+    if isinstance(dtype, dpt.usm_ndarray):
+        dtype = dtype.dtype
+    _supported_dtype([dpt.dtype(dtype)])
+    return finfo_object(dtype)
+
+
+def _supported_dtype(dtypes):
+    for dtype in dtypes:
+        if dtype.char not in "?bBhHiIlLqQefdFD":
+            raise ValueError(f"Dpctl doesn't support dtype {dtype}.")
+    return True
+
+
+def isdtype(dtype, kind):
+    """isdtype(dtype, kind)
+
+    Returns a boolean indicating whether a provided `dtype` is
+    of a specified data type `kind`.
+
+    See [array API](array_api) for more information.
+
+    [array_api]: https://data-apis.org/array-api/latest/
+    """
+
+    if not isinstance(dtype, np.dtype):
+        raise TypeError(f"Expected instance of `dpt.dtype`, got {dtype}")
+
+    if isinstance(kind, np.dtype):
+        return dtype == kind
+
+    elif isinstance(kind, str):
+        if kind == "bool":
+            return dtype == np.dtype("bool")
+        elif kind == "signed integer":
+            return dtype.kind == "i"
+        elif kind == "unsigned integer":
+            return dtype.kind == "u"
+        elif kind == "integral":
+            return dtype.kind in "iu"
+        elif kind == "real floating":
+            return dtype.kind == "f"
+        elif kind == "complex floating":
+            return dtype.kind == "c"
+        elif kind == "numeric":
+            return dtype.kind in "iufc"
+        else:
+            raise ValueError(f"Unrecognized data type kind: {kind}")
+
+    elif isinstance(kind, tuple):
+        return any(isdtype(dtype, k) for k in kind)
+
+    else:
+        raise TypeError(f"Unsupported data type kind: {kind}")
+
+
+def _default_accumulation_dtype(inp_dt, q):
+    """Gives default output data type for given input data
+    type `inp_dt` when accumulation is performed on queue `q`
+    """
+    inp_kind = inp_dt.kind
+    if inp_kind in "bi":
+        res_dt = dpt.dtype(ti.default_device_int_type(q))
+        if inp_dt.itemsize > res_dt.itemsize:
+            res_dt = inp_dt
+    elif inp_kind in "u":
+        res_dt = dpt.dtype(ti.default_device_uint_type(q))
+        res_ii = dpt_ext.iinfo(res_dt)
+        inp_ii = dpt_ext.iinfo(inp_dt)
+        if inp_ii.min >= res_ii.min and inp_ii.max <= res_ii.max:
+            pass
+        else:
+            res_dt = inp_dt
+    elif inp_kind in "fc":
+        res_dt = inp_dt
+
+    return res_dt
+
+
+def _default_accumulation_dtype_fp_types(inp_dt, q):
+    """Gives default output data type for given input data
+    type `inp_dt` when accumulation is performed on queue `q`
+    and the accumulation supports only floating-point data types
+    """
+    inp_kind = inp_dt.kind
+    if inp_kind in "biu":
+        res_dt = dpt.dtype(ti.default_device_fp_type(q))
+        can_cast_v = dpt_ext.can_cast(inp_dt, res_dt)
+        if not can_cast_v:
+            _fp64 = q.sycl_device.has_aspect_fp64
+            res_dt = dpt.float64 if _fp64 else dpt.float32
+    elif inp_kind in "f":
+        res_dt = inp_dt
+    elif inp_kind in "c":
+        raise ValueError("function not defined for complex types")
+
+    return res_dt
+
+
+__all__ = [
+    "_find_buf_dtype",
+    "_find_buf_dtype2",
+    "_to_device_supported_dtype",
+    "_acceptance_fn_default_unary",
+    "_acceptance_fn_reciprocal",
+    "_acceptance_fn_default_binary",
+    "_acceptance_fn_divide",
+    "_acceptance_fn_negative",
+    "_acceptance_fn_subtract",
+    "_resolve_one_strong_one_weak_types",
+    "_resolve_one_strong_two_weak_types",
+    "_resolve_weak_types",
+    "_resolve_weak_types_all_py_ints",
+    "_weak_type_num_kind",
+    "_strong_dtype_num_kind",
+    "can_cast",
+    "finfo",
+    "iinfo",
+    "isdtype",
+    "result_type",
+    "WeakBooleanType",
+    "WeakIntegralType",
+    "WeakFloatingType",
+    "WeakComplexType",
+    "_default_accumulation_dtype",
+    "_default_accumulation_dtype_fp_types",
+    "_find_buf_dtype_in_place_op",
+]
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/clip.hpp b/dpctl_ext/tensor/libtensor/include/kernels/clip.hpp
new file mode 100644
index 000000000000..58a86a8f82d6
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/clip.hpp
@@ -0,0 +1,357 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for dpctl.tensor.clip.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "kernels/alignment.hpp"
+#include "utils/math_utils.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::clip
+{
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::offset_utils;
+
+using dpctl::tensor::kernels::alignment_utils::
+    disabled_sg_loadstore_wrapper_krn;
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
+template <typename T>
+T clip(const T &x, const T &min, const T &max)
+{
+    using dpctl::tensor::type_utils::is_complex;
+    if constexpr (is_complex<T>::value) {
+        using dpctl::tensor::math_utils::max_complex;
+        using dpctl::tensor::math_utils::min_complex;
+        return min_complex(max_complex(x, min), max);
+    }
+    else if constexpr (std::is_floating_point_v<T> ||
+                       std::is_same_v<T, sycl::half>) {
+        auto tmp = (std::isnan(x) || x > min) ? x : min;
+        return (std::isnan(tmp) || tmp < max) ? tmp : max;
+    }
+    else if constexpr (std::is_same_v<T, bool>) {
+        return (x || min) && max;
+    }
+    else {
+        auto tmp = (x > min) ? x : min;
+        return (tmp < max) ? tmp : max;
+    }
+}
+
+template <typename T,
+          std::uint8_t vec_sz = 4,
+          std::uint8_t n_vecs = 2,
+          bool enable_sg_loadstore = true>
+class ClipContigFunctor
+{
+private:
+    std::size_t nelems = 0;
+    const T *x_p = nullptr;
+    const T *min_p = nullptr;
+    const T *max_p = nullptr;
+    T *dst_p = nullptr;
+
+public:
+    ClipContigFunctor(std::size_t nelems_,
+                      const T *x_p_,
+                      const T *min_p_,
+                      const T *max_p_,
+                      T *dst_p_)
+        : nelems(nelems_), x_p(x_p_), min_p(min_p_), max_p(max_p_),
+          dst_p(dst_p_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
+
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
+            const std::uint16_t sgSize =
+                ndit.get_sub_group().get_local_range()[0];
+            const std::size_t gid = ndit.get_global_linear_id();
+            const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi;
+
+            const std::size_t start =
+                (gid / sgSize) * (nelems_per_sg - sgSize) + gid;
+            const std::size_t end = std::min(nelems, start + nelems_per_sg);
+
+            for (std::size_t offset = start; offset < end; offset += sgSize) {
+                dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]);
+            }
+        }
+        else {
+            auto sg = ndit.get_sub_group();
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            const std::size_t base =
+                nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                 sg.get_group_id()[0] * sgSize);
+
+            if (base + nelems_per_wi * sgSize < nelems) {
+                sycl::vec<T, vec_sz> dst_vec;
+#pragma unroll
+                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                    const std::size_t idx = base + it * sgSize;
+                    auto x_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&x_p[idx]);
+                    auto min_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&min_p[idx]);
+                    auto max_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&max_p[idx]);
+                    auto dst_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&dst_p[idx]);
+
+                    const sycl::vec<T, vec_sz> x_vec =
+                        sub_group_load<vec_sz>(sg, x_multi_ptr);
+                    const sycl::vec<T, vec_sz> min_vec =
+                        sub_group_load<vec_sz>(sg, min_multi_ptr);
+                    const sycl::vec<T, vec_sz> max_vec =
+                        sub_group_load<vec_sz>(sg, max_multi_ptr);
+#pragma unroll
+                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
+                        dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id],
+                                               max_vec[vec_id]);
+                    }
+                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (std::size_t k = base + lane_id; k < nelems; k += sgSize) {
+                    dst_p[k] = clip(x_p[k], min_p[k], max_p[k]);
+                }
+            }
+        }
+    }
+};
+
+template <typename T, int vec_sz, int n_vecs>
+class clip_contig_kernel;
+
+typedef sycl::event (*clip_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    const char *,
+    const char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event clip_contig_impl(sycl::queue &q,
+                             std::size_t nelems,
+                             const char *x_cp,
+                             const char *min_cp,
+                             const char *max_cp,
+                             char *dst_cp,
+                             const std::vector<sycl::event> &depends)
+{
+    const T *x_tp = reinterpret_cast<const T *>(x_cp);
+    const T *min_tp = reinterpret_cast<const T *>(min_cp);
+    const T *max_tp = reinterpret_cast<const T *>(max_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+    sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        std::size_t lws = 64;
+        static constexpr std::uint8_t vec_sz = 4;
+        static constexpr std::uint8_t n_vecs = 2;
+        const std::size_t n_groups =
+            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+        const auto gws_range = sycl::range<1>(n_groups * lws);
+        const auto lws_range = sycl::range<1>(lws);
+
+        if (is_aligned<required_alignment>(x_cp) &&
+            is_aligned<required_alignment>(min_cp) &&
+            is_aligned<required_alignment>(max_cp) &&
+            is_aligned<required_alignment>(dst_cp))
+        {
+            static constexpr bool enable_sg_loadstore = true;
+            using KernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
+            using Impl =
+                ClipContigFunctor<T, vec_sz, n_vecs, enable_sg_loadstore>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                Impl(nelems, x_tp, min_tp, max_tp, dst_tp));
+        }
+        else {
+            static constexpr bool disable_sg_loadstore = false;
+            using InnerKernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
+            using KernelName =
+                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+            using Impl =
+                ClipContigFunctor<T, vec_sz, n_vecs, disable_sg_loadstore>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                Impl(nelems, x_tp, min_tp, max_tp, dst_tp));
+        }
+    });
+
+    return clip_ev;
+}
+
+template <typename T, typename IndexerT>
+class ClipStridedFunctor
+{
+private:
+    const T *x_p = nullptr;
+    const T *min_p = nullptr;
+    const T *max_p = nullptr;
+    T *dst_p = nullptr;
+    IndexerT indexer;
+
+public:
+    ClipStridedFunctor(const T *x_p_,
+                       const T *min_p_,
+                       const T *max_p_,
+                       T *dst_p_,
+                       const IndexerT &indexer_)
+        : x_p(x_p_), min_p(min_p_), max_p(max_p_), dst_p(dst_p_),
+          indexer(indexer_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        std::size_t gid = id[0];
+        auto offsets = indexer(static_cast<ssize_t>(gid));
+        dst_p[offsets.get_fourth_offset()] = clip(
+            x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()],
+            max_p[offsets.get_third_offset()]);
+    }
+};
+
+template <typename T, typename IndexerT>
+class clip_strided_kernel;
+
+typedef sycl::event (*clip_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const char *,
+    const char *,
+    const char *,
+    char *,
+    const ssize_t *,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event clip_strided_impl(sycl::queue &q,
+                              std::size_t nelems,
+                              int nd,
+                              const char *x_cp,
+                              const char *min_cp,
+                              const char *max_cp,
+                              char *dst_cp,
+                              const ssize_t *shape_strides,
+                              ssize_t x_offset,
+                              ssize_t min_offset,
+                              ssize_t max_offset,
+                              ssize_t dst_offset,
+                              const std::vector<sycl::event> &depends)
+{
+    const T *x_tp = reinterpret_cast<const T *>(x_cp);
+    const T *min_tp = reinterpret_cast<const T *>(min_cp);
+    const T *max_tp = reinterpret_cast<const T *>(max_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+    sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const FourOffsets_StridedIndexer indexer{
+            nd, x_offset, min_offset, max_offset, dst_offset, shape_strides};
+
+        using KernelName = clip_strided_kernel<T, FourOffsets_StridedIndexer>;
+        using Impl = ClipStridedFunctor<T, FourOffsets_StridedIndexer>;
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems),
+            Impl(x_tp, min_tp, max_tp, dst_tp, indexer));
+    });
+
+    return clip_ev;
+}
+
+template <typename fnT, typename T>
+struct ClipStridedFactory
+{
+    fnT get()
+    {
+        fnT fn = clip_strided_impl<T>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct ClipContigFactory
+{
+    fnT get()
+    {
+
+        fnT fn = clip_contig_impl<T>;
+        return fn;
+    }
+};
+
+} // namespace dpctl::tensor::kernels::clip
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp b/dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp
new file mode 100644
index 000000000000..83a520adb538
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp
@@ -0,0 +1,460 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor repeating operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "utils/offset_utils.hpp"
+
+namespace dpctl::tensor::kernels::repeat
+{
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::offset_utils;
+
+template <typename OrthogIndexer,
+          typename SrcAxisIndexer,
+          typename DstAxisIndexer,
+          typename RepIndexer,
+          typename T,
+          typename repT>
+class repeat_by_sequence_kernel;
+
+template <typename OrthogIndexer,
+          typename SrcAxisIndexer,
+          typename DstAxisIndexer,
+          typename RepIndexer,
+          typename T,
+          typename repT>
+class RepeatSequenceFunctor
+{
+private:
+    const T *src = nullptr;
+    T *dst = nullptr;
+    const repT *reps = nullptr;
+    const repT *cumsum = nullptr;
+    std::size_t src_axis_nelems = 1;
+    OrthogIndexer orthog_strider;
+    SrcAxisIndexer src_axis_strider;
+    DstAxisIndexer dst_axis_strider;
+    RepIndexer reps_strider;
+
+public:
+    RepeatSequenceFunctor(const T *src_,
+                          T *dst_,
+                          const repT *reps_,
+                          const repT *cumsum_,
+                          std::size_t src_axis_nelems_,
+                          const OrthogIndexer &orthog_strider_,
+                          const SrcAxisIndexer &src_axis_strider_,
+                          const DstAxisIndexer &dst_axis_strider_,
+                          const RepIndexer &reps_strider_)
+        : src(src_), dst(dst_), reps(reps_), cumsum(cumsum_),
+          src_axis_nelems(src_axis_nelems_), orthog_strider(orthog_strider_),
+          src_axis_strider(src_axis_strider_),
+          dst_axis_strider(dst_axis_strider_), reps_strider(reps_strider_)
+    {
+    }
+
+    void operator()(sycl::id<1> idx) const
+    {
+        std::size_t id = idx[0];
+        auto i_orthog = id / src_axis_nelems;
+        auto i_along = id - (i_orthog * src_axis_nelems);
+
+        auto orthog_offsets = orthog_strider(i_orthog);
+        auto src_offset = orthog_offsets.get_first_offset();
+        auto dst_offset = orthog_offsets.get_second_offset();
+
+        auto val = src[src_offset + src_axis_strider(i_along)];
+        auto last = cumsum[i_along];
+        auto first = last - reps[reps_strider(i_along)];
+        for (auto i = first; i < last; ++i) {
+            dst[dst_offset + dst_axis_strider(i)] = val;
+        }
+    }
+};
+
+typedef sycl::event (*repeat_by_sequence_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    char *,
+    const char *,
+    const char *,
+    int,
+    const ssize_t *,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T, typename repT>
+sycl::event
+    repeat_by_sequence_impl(sycl::queue &q,
+                            std::size_t orthog_nelems,
+                            std::size_t src_axis_nelems,
+                            const char *src_cp,
+                            char *dst_cp,
+                            const char *reps_cp,
+                            const char *cumsum_cp,
+                            int orthog_nd,
+                            const ssize_t *orthog_src_dst_shape_and_strides,
+                            ssize_t src_offset,
+                            ssize_t dst_offset,
+                            ssize_t src_axis_shape,
+                            ssize_t src_axis_stride,
+                            ssize_t dst_axis_shape,
+                            ssize_t dst_axis_stride,
+                            ssize_t reps_shape,
+                            ssize_t reps_stride,
+                            const std::vector<sycl::event> &depends)
+{
+    sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const T *src_tp = reinterpret_cast<const T *>(src_cp);
+        const repT *reps_tp = reinterpret_cast<const repT *>(reps_cp);
+        const repT *cumsum_tp = reinterpret_cast<const repT *>(cumsum_cp);
+        T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+        // orthog ndim indexer
+        const TwoOffsets_StridedIndexer orthog_indexer{
+            orthog_nd, src_offset, dst_offset,
+            orthog_src_dst_shape_and_strides};
+        // indexers along repeated axis
+        const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape,
+                                                /* step */ src_axis_stride};
+        const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape,
+                                                /* step */ dst_axis_stride};
+        // indexer along reps array
+        const Strided1DIndexer reps_indexer{/* size */ reps_shape,
+                                            /* step */ reps_stride};
+
+        const std::size_t gws = orthog_nelems * src_axis_nelems;
+
+        cgh.parallel_for<repeat_by_sequence_kernel<
+            TwoOffsets_StridedIndexer, Strided1DIndexer, Strided1DIndexer,
+            Strided1DIndexer, T, repT>>(
+            sycl::range<1>(gws),
+            RepeatSequenceFunctor<TwoOffsets_StridedIndexer, Strided1DIndexer,
+                                  Strided1DIndexer, Strided1DIndexer, T, repT>(
+                src_tp, dst_tp, reps_tp, cumsum_tp, src_axis_nelems,
+                orthog_indexer, src_axis_indexer, dst_axis_indexer,
+                reps_indexer));
+    });
+
+    return repeat_ev;
+}
+
+template <typename fnT, typename T>
+struct RepeatSequenceFactory
+{
+    fnT get()
+    {
+        fnT fn = repeat_by_sequence_impl<T, std::int64_t>;
+        return fn;
+    }
+};
+
+typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    char *,
+    const char *,
+    const char *,
+    int,
+    const ssize_t *,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T, typename repT>
+sycl::event repeat_by_sequence_1d_impl(sycl::queue &q,
+                                       std::size_t src_nelems,
+                                       const char *src_cp,
+                                       char *dst_cp,
+                                       const char *reps_cp,
+                                       const char *cumsum_cp,
+                                       int src_nd,
+                                       const ssize_t *src_shape_strides,
+                                       ssize_t dst_shape,
+                                       ssize_t dst_stride,
+                                       ssize_t reps_shape,
+                                       ssize_t reps_stride,
+                                       const std::vector<sycl::event> &depends)
+{
+    sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const T *src_tp = reinterpret_cast<const T *>(src_cp);
+        const repT *reps_tp = reinterpret_cast<const repT *>(reps_cp);
+        const repT *cumsum_tp = reinterpret_cast<const repT *>(cumsum_cp);
+        T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+        // orthog ndim indexer
+        static constexpr TwoZeroOffsets_Indexer orthog_indexer{};
+        // indexers along repeated axis
+        const StridedIndexer src_indexer{src_nd, 0, src_shape_strides};
+        const Strided1DIndexer dst_indexer{/* size */ dst_shape,
+                                           /* step */ dst_stride};
+        // indexer along reps array
+        const Strided1DIndexer reps_indexer{/* size */ reps_shape,
+                                            /* step */ reps_stride};
+
+        const std::size_t gws = src_nelems;
+
+        cgh.parallel_for<repeat_by_sequence_kernel<
+            TwoZeroOffsets_Indexer, StridedIndexer, Strided1DIndexer,
+            Strided1DIndexer, T, repT>>(
+            sycl::range<1>(gws),
+            RepeatSequenceFunctor<TwoZeroOffsets_Indexer, StridedIndexer,
+                                  Strided1DIndexer, Strided1DIndexer, T, repT>(
+                src_tp, dst_tp, reps_tp, cumsum_tp, src_nelems, orthog_indexer,
+                src_indexer, dst_indexer, reps_indexer));
+    });
+
+    return repeat_ev;
+}
+
+template <typename fnT, typename T>
+struct RepeatSequence1DFactory
+{
+    fnT get()
+    {
+        fnT fn = repeat_by_sequence_1d_impl<T, std::int64_t>;
+        return fn;
+    }
+};
+
+template <typename OrthogIndexer,
+          typename SrcAxisIndexer,
+          typename DstAxisIndexer,
+          typename T>
+class repeat_by_scalar_kernel;
+
+template <typename OrthogIndexer,
+          typename SrcAxisIndexer,
+          typename DstAxisIndexer,
+          typename T>
+class RepeatScalarFunctor
+{
+private:
+    const T *src = nullptr;
+    T *dst = nullptr;
+    ssize_t reps = 1;
+    std::size_t dst_axis_nelems = 0;
+    OrthogIndexer orthog_strider;
+    SrcAxisIndexer src_axis_strider;
+    DstAxisIndexer dst_axis_strider;
+
+public:
+    RepeatScalarFunctor(const T *src_,
+                        T *dst_,
+                        const ssize_t reps_,
+                        std::size_t dst_axis_nelems_,
+                        const OrthogIndexer &orthog_strider_,
+                        const SrcAxisIndexer &src_axis_strider_,
+                        const DstAxisIndexer &dst_axis_strider_)
+        : src(src_), dst(dst_), reps(reps_), dst_axis_nelems(dst_axis_nelems_),
+          orthog_strider(orthog_strider_), src_axis_strider(src_axis_strider_),
+          dst_axis_strider(dst_axis_strider_)
+    {
+    }
+
+    void operator()(sycl::id<1> idx) const
+    {
+        std::size_t id = idx[0];
+        auto i_orthog = id / dst_axis_nelems;
+        auto i_along = id - (i_orthog * dst_axis_nelems);
+
+        auto orthog_offsets = orthog_strider(i_orthog);
+        auto src_offset = orthog_offsets.get_first_offset();
+        auto dst_offset = orthog_offsets.get_second_offset();
+
+        auto dst_axis_offset = dst_axis_strider(i_along);
+        auto src_axis_offset = src_axis_strider(i_along / reps);
+        dst[dst_offset + dst_axis_offset] = src[src_offset + src_axis_offset];
+    }
+};
+
+typedef sycl::event (*repeat_by_scalar_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    char *,
+    const ssize_t,
+    int,
+    const ssize_t *,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event repeat_by_scalar_impl(sycl::queue &q,
+                                  std::size_t orthog_nelems,
+                                  std::size_t dst_axis_nelems,
+                                  const char *src_cp,
+                                  char *dst_cp,
+                                  const ssize_t reps,
+                                  int orthog_nd,
+                                  const ssize_t *orthog_shape_and_strides,
+                                  ssize_t src_offset,
+                                  ssize_t dst_offset,
+                                  ssize_t src_axis_shape,
+                                  ssize_t src_axis_stride,
+                                  ssize_t dst_axis_shape,
+                                  ssize_t dst_axis_stride,
+                                  const std::vector<sycl::event> &depends)
+{
+    sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const T *src_tp = reinterpret_cast<const T *>(src_cp);
+        T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+        // orthog ndim indexer
+        const TwoOffsets_StridedIndexer orthog_indexer{
+            orthog_nd, src_offset, dst_offset, orthog_shape_and_strides};
+        // indexers along repeated axis
+        const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape,
+                                                /* step */ src_axis_stride};
+        const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape,
+                                                /* step */ dst_axis_stride};
+
+        const std::size_t gws = orthog_nelems * dst_axis_nelems;
+
+        cgh.parallel_for<repeat_by_scalar_kernel<
+            TwoOffsets_StridedIndexer, Strided1DIndexer, Strided1DIndexer, T>>(
+            sycl::range<1>(gws),
+            RepeatScalarFunctor<TwoOffsets_StridedIndexer, Strided1DIndexer,
+                                Strided1DIndexer, T>(
+                src_tp, dst_tp, reps, dst_axis_nelems, orthog_indexer,
+                src_axis_indexer, dst_axis_indexer));
+    });
+
+    return repeat_ev;
+}
+
+template <typename fnT, typename T>
+struct RepeatScalarFactory
+{
+    fnT get()
+    {
+        fnT fn = repeat_by_scalar_impl<T>;
+        return fn;
+    }
+};
+
+typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    char *,
+    const ssize_t,
+    int,
+    const ssize_t *,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event repeat_by_scalar_1d_impl(sycl::queue &q,
+                                     std::size_t dst_nelems,
+                                     const char *src_cp,
+                                     char *dst_cp,
+                                     const ssize_t reps,
+                                     int src_nd,
+                                     const ssize_t *src_shape_strides,
+                                     ssize_t dst_shape,
+                                     ssize_t dst_stride,
+                                     const std::vector<sycl::event> &depends)
+{
+    sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const T *src_tp = reinterpret_cast<const T *>(src_cp);
+        T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+        // orthog ndim indexer
+        static constexpr TwoZeroOffsets_Indexer orthog_indexer{};
+        // indexers along repeated axis
+        const StridedIndexer src_indexer(src_nd, 0, src_shape_strides);
+        const Strided1DIndexer dst_indexer{/* size */ dst_shape,
+                                           /* step */ dst_stride};
+
+        const std::size_t gws = dst_nelems;
+
+        cgh.parallel_for<repeat_by_scalar_kernel<
+            TwoZeroOffsets_Indexer, StridedIndexer, Strided1DIndexer, T>>(
+            sycl::range<1>(gws),
+            RepeatScalarFunctor<TwoZeroOffsets_Indexer, StridedIndexer,
+                                Strided1DIndexer, T>(src_tp, dst_tp, reps,
+                                                     dst_nelems, orthog_indexer,
+                                                     src_indexer, dst_indexer));
+    });
+
+    return repeat_ev;
+}
+
+template <typename fnT, typename T>
+struct RepeatScalar1DFactory
+{
+    fnT get()
+    {
+        fnT fn = repeat_by_scalar_1d_impl<T>;
+        return fn;
+    }
+};
+
+} // namespace dpctl::tensor::kernels::repeat
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/where.hpp b/dpctl_ext/tensor/libtensor/include/kernels/where.hpp
new file mode 100644
index 000000000000..454e1e61fa0d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/where.hpp
@@ -0,0 +1,338 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for dpctl.tensor.where.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "kernels/alignment.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::search
+{
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::offset_utils;
+
+using dpctl::tensor::kernels::alignment_utils::
+    disabled_sg_loadstore_wrapper_krn;
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
+template <typename T, typename condT, typename IndexerT>
+class where_strided_kernel;
+template <typename T, typename condT, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class where_contig_kernel;
+
+template <typename T,
+          typename condT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+class WhereContigFunctor
+{
+private:
+    std::size_t nelems = 0;
+    const condT *cond_p = nullptr;
+    const T *x1_p = nullptr;
+    const T *x2_p = nullptr;
+    T *dst_p = nullptr;
+
+public:
+    WhereContigFunctor(std::size_t nelems_,
+                       const condT *cond_p_,
+                       const T *x1_p_,
+                       const T *x2_p_,
+                       T *dst_p_)
+        : nelems(nelems_), cond_p(cond_p_), x1_p(x1_p_), x2_p(x2_p_),
+          dst_p(dst_p_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
+
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (!enable_sg_loadstore || is_complex<condT>::value ||
+                      is_complex<T>::value)
+        {
+            const std::uint16_t sgSize =
+                ndit.get_sub_group().get_local_range()[0];
+            const std::size_t gid = ndit.get_global_linear_id();
+
+            const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi;
+            const std::size_t start =
+                (gid / sgSize) * (nelems_per_sg - sgSize) + gid;
+            const std::size_t end = std::min(nelems, start + nelems_per_sg);
+            for (std::size_t offset = start; offset < end; offset += sgSize) {
+                using dpctl::tensor::type_utils::convert_impl;
+                const bool check = convert_impl<bool, condT>(cond_p[offset]);
+                dst_p[offset] = check ? x1_p[offset] : x2_p[offset];
+            }
+        }
+        else {
+            auto sg = ndit.get_sub_group();
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            const std::size_t base =
+                nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                 sg.get_group_id()[0] * sgSize);
+
+            if (base + nelems_per_wi * sgSize < nelems) {
+                sycl::vec<T, vec_sz> dst_vec;
+
+#pragma unroll
+                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                    const std::size_t idx = base + it * sgSize;
+                    auto x1_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&x1_p[idx]);
+                    auto x2_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&x2_p[idx]);
+                    auto cond_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&cond_p[idx]);
+                    auto dst_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&dst_p[idx]);
+
+                    const sycl::vec<T, vec_sz> x1_vec =
+                        sub_group_load<vec_sz>(sg, x1_multi_ptr);
+                    const sycl::vec<T, vec_sz> x2_vec =
+                        sub_group_load<vec_sz>(sg, x2_multi_ptr);
+                    const sycl::vec<condT, vec_sz> cond_vec =
+                        sub_group_load<vec_sz>(sg, cond_multi_ptr);
+#pragma unroll
+                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
+                        dst_vec[k] = cond_vec[k] ? x1_vec[k] : x2_vec[k];
+                    }
+                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (std::size_t k = base + lane_id; k < nelems; k += sgSize) {
+                    dst_p[k] = cond_p[k] ? x1_p[k] : x2_p[k];
+                }
+            }
+        }
+    }
+};
+
+typedef sycl::event (*where_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    const char *,
+    const char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+template <typename T, typename condT>
+sycl::event where_contig_impl(sycl::queue &q,
+                              std::size_t nelems,
+                              const char *cond_cp,
+                              const char *x1_cp,
+                              const char *x2_cp,
+                              char *dst_cp,
+                              const std::vector<sycl::event> &depends)
+{
+    const condT *cond_tp = reinterpret_cast<const condT *>(cond_cp);
+    const T *x1_tp = reinterpret_cast<const T *>(x1_cp);
+    const T *x2_tp = reinterpret_cast<const T *>(x2_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+    sycl::event where_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        std::size_t lws = 64;
+        static constexpr std::uint8_t vec_sz = 4u;
+        static constexpr std::uint8_t n_vecs = 2u;
+        const std::size_t n_groups =
+            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+        const auto gws_range = sycl::range<1>(n_groups * lws);
+        const auto lws_range = sycl::range<1>(lws);
+
+        if (is_aligned<required_alignment>(cond_cp) &&
+            is_aligned<required_alignment>(x1_cp) &&
+            is_aligned<required_alignment>(x2_cp) &&
+            is_aligned<required_alignment>(dst_cp))
+        {
+            static constexpr bool enable_sg_loadstore = true;
+            using KernelName = where_contig_kernel<T, condT, vec_sz, n_vecs>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                WhereContigFunctor<T, condT, vec_sz, n_vecs,
+                                   enable_sg_loadstore>(nelems, cond_tp, x1_tp,
+                                                        x2_tp, dst_tp));
+        }
+        else {
+            static constexpr bool disable_sg_loadstore = false;
+            using InnerKernelName =
+                where_contig_kernel<T, condT, vec_sz, n_vecs>;
+            using KernelName =
+                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                WhereContigFunctor<T, condT, vec_sz, n_vecs,
+                                   disable_sg_loadstore>(nelems, cond_tp, x1_tp,
+                                                         x2_tp, dst_tp));
+        }
+    });
+
+    return where_ev;
+}
+
+template <typename T, typename condT, typename IndexerT>
+class WhereStridedFunctor
+{
+private:
+    const T *x1_p = nullptr;
+    const T *x2_p = nullptr;
+    T *dst_p = nullptr;
+    const condT *cond_p = nullptr;
+    IndexerT indexer;
+
+public:
+    WhereStridedFunctor(const condT *cond_p_,
+                        const T *x1_p_,
+                        const T *x2_p_,
+                        T *dst_p_,
+                        const IndexerT &indexer_)
+        : x1_p(x1_p_), x2_p(x2_p_), dst_p(dst_p_), cond_p(cond_p_),
+          indexer(indexer_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        std::size_t gid = id[0];
+        auto offsets = indexer(static_cast<ssize_t>(gid));
+
+        using dpctl::tensor::type_utils::convert_impl;
+        bool check =
+            convert_impl<bool, condT>(cond_p[offsets.get_first_offset()]);
+
+        dst_p[offsets.get_fourth_offset()] =
+            check ? x1_p[offsets.get_second_offset()]
+                  : x2_p[offsets.get_third_offset()];
+    }
+};
+
+typedef sycl::event (*where_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const char *,
+    const char *,
+    const char *,
+    char *,
+    const ssize_t *,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T, typename condT>
+sycl::event where_strided_impl(sycl::queue &q,
+                               std::size_t nelems,
+                               int nd,
+                               const char *cond_cp,
+                               const char *x1_cp,
+                               const char *x2_cp,
+                               char *dst_cp,
+                               const ssize_t *shape_strides,
+                               ssize_t x1_offset,
+                               ssize_t x2_offset,
+                               ssize_t cond_offset,
+                               ssize_t dst_offset,
+                               const std::vector<sycl::event> &depends)
+{
+    const condT *cond_tp = reinterpret_cast<const condT *>(cond_cp);
+    const T *x1_tp = reinterpret_cast<const T *>(x1_cp);
+    const T *x2_tp = reinterpret_cast<const T *>(x2_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+    sycl::event where_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const FourOffsets_StridedIndexer indexer{
+            nd, cond_offset, x1_offset, x2_offset, dst_offset, shape_strides};
+
+        cgh.parallel_for<
+            where_strided_kernel<T, condT, FourOffsets_StridedIndexer>>(
+            sycl::range<1>(nelems),
+            WhereStridedFunctor<T, condT, FourOffsets_StridedIndexer>(
+                cond_tp, x1_tp, x2_tp, dst_tp, indexer));
+    });
+
+    return where_ev;
+}
+
+template <typename fnT, typename T, typename condT>
+struct WhereStridedFactory
+{
+    fnT get()
+    {
+        fnT fn = where_strided_impl<T, condT>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T, typename condT>
+struct WhereContigFactory
+{
+    fnT get()
+    {
+        fnT fn = where_contig_impl<T, condT>;
+        return fn;
+    }
+};
+
+} // namespace dpctl::tensor::kernels::search
diff --git a/dpctl_ext/tensor/libtensor/source/clip.cpp b/dpctl_ext/tensor/libtensor/source/clip.cpp
new file mode 100644
index 000000000000..3e1c5e8cd262
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/clip.cpp
@@ -0,0 +1,265 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines Python API for implementation functions of
+/// dpctl.tensor.clip
+//===---------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstddef>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "clip.hpp"
+#include "kernels/clip.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::clip::clip_contig_impl_fn_ptr_t;
+using dpctl::tensor::kernels::clip::clip_strided_impl_fn_ptr_t;
+
+static clip_contig_impl_fn_ptr_t clip_contig_dispatch_vector[td_ns::num_types];
+static clip_strided_impl_fn_ptr_t
+    clip_strided_dispatch_vector[td_ns::num_types];
+
+void init_clip_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::clip::ClipContigFactory;
+    DispatchVectorBuilder<clip_contig_impl_fn_ptr_t, ClipContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(clip_contig_dispatch_vector);
+
+    using dpctl::tensor::kernels::clip::ClipStridedFactory;
+    DispatchVectorBuilder<clip_strided_impl_fn_ptr_t, ClipStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(clip_strided_dispatch_vector);
+}
+
+using dpctl::utils::keep_args_alive;
+
+std::pair<sycl::event, sycl::event>
+    py_clip(const dpctl::tensor::usm_ndarray &src,
+            const dpctl::tensor::usm_ndarray &min,
+            const dpctl::tensor::usm_ndarray &max,
+            const dpctl::tensor::usm_ndarray &dst,
+            sycl::queue &exec_q,
+            const std::vector<sycl::event> &depends)
+{
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, min, max, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    int nd = src.get_ndim();
+    int min_nd = min.get_ndim();
+    int max_nd = max.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (nd != min_nd || nd != max_nd) {
+        throw py::value_error(
+            "Input arrays are not of appropriate dimension for clip kernel.");
+    }
+
+    if (nd != dst_nd) {
+        throw py::value_error(
+            "Destination is not of appropriate dimension for clip kernel.");
+    }
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *min_shape = min.get_shape_raw();
+    const py::ssize_t *max_shape = max.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    std::size_t nelems(1);
+    for (int i = 0; i < nd; ++i) {
+        const auto &sh_i = dst_shape[i];
+        nelems *= static_cast<std::size_t>(sh_i);
+        shapes_equal = shapes_equal && (min_shape[i] == sh_i) &&
+                       (max_shape[i] == sh_i) && (src_shape[i] == sh_i);
+    }
+
+    if (!shapes_equal) {
+        throw py::value_error("Arrays are not of matching shapes.");
+    }
+
+    if (nelems == 0) {
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(dst, src) && !same_logical_tensors(dst, src)) ||
+        (overlap(dst, min) && !same_logical_tensors(dst, min)) ||
+        (overlap(dst, max) && !same_logical_tensors(dst, max)))
+    {
+        throw py::value_error("Destination array overlaps with input.");
+    }
+
+    int min_typenum = min.get_typenum();
+    int max_typenum = max.get_typenum();
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int min_typeid = array_types.typenum_to_lookup_id(min_typenum);
+    int max_typeid = array_types.typenum_to_lookup_id(max_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid || src_typeid != min_typeid ||
+        src_typeid != max_typeid)
+    {
+        throw py::value_error("Input, min, max, and destination arrays must "
+                              "have the same data type");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems);
+
+    char *src_data = src.get_data();
+    char *min_data = min.get_data();
+    char *max_data = max.get_data();
+    char *dst_data = dst.get_data();
+
+    bool is_min_c_contig = min.is_c_contiguous();
+    bool is_min_f_contig = min.is_f_contiguous();
+
+    bool is_max_c_contig = max.is_c_contiguous();
+    bool is_max_f_contig = max.is_f_contiguous();
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_dst_f_contig = dst.is_f_contiguous();
+
+    bool all_c_contig = (is_min_c_contig && is_max_c_contig &&
+                         is_src_c_contig && is_dst_c_contig);
+    bool all_f_contig = (is_min_f_contig && is_max_f_contig &&
+                         is_src_f_contig && is_dst_f_contig);
+
+    if (all_c_contig || all_f_contig) {
+        auto fn = clip_contig_dispatch_vector[src_typeid];
+
+        sycl::event clip_ev =
+            fn(exec_q, nelems, src_data, min_data, max_data, dst_data, depends);
+        sycl::event ht_ev =
+            keep_args_alive(exec_q, {src, min, max, dst}, {clip_ev});
+
+        return std::make_pair(ht_ev, clip_ev);
+    }
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &min_strides = min.get_strides_vector();
+    auto const &max_strides = max.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_min_strides;
+    shT simplified_max_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t min_offset(0);
+    py::ssize_t max_offset(0);
+    py::ssize_t dst_offset(0);
+
+    simplify_iteration_space_4(
+        nd, src_shape, src_strides, min_strides, max_strides, dst_strides,
+        // outputs
+        simplified_shape, simplified_src_strides, simplified_min_strides,
+        simplified_max_strides, simplified_dst_strides, src_offset, min_offset,
+        max_offset, dst_offset);
+
+    auto fn = clip_strided_dispatch_vector[src_typeid];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events,
+        // common shape and strides
+        simplified_shape, simplified_src_strides, simplified_min_strides,
+        simplified_max_strides, simplified_dst_strides);
+    auto packed_shape_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shape_strides_ev);
+
+    assert(all_deps.size() == depends.size() + 1);
+
+    sycl::event clip_ev = fn(exec_q, nelems, nd, src_data, min_data, max_data,
+                             dst_data, packed_shape_strides, src_offset,
+                             min_offset, max_offset, dst_offset, all_deps);
+
+    // free packed temporaries
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {clip_ev}, packed_shape_strides_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    sycl::event arg_cleanup_ev =
+        keep_args_alive(exec_q, {src, min, max, dst}, host_task_events);
+
+    return std::make_pair(arg_cleanup_ev, clip_ev);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/clip.hpp b/dpctl_ext/tensor/libtensor/source/clip.hpp
new file mode 100644
index 000000000000..de8f0e559b6e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/clip.hpp
@@ -0,0 +1,57 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines Python API for implementation functions of
+/// dpctl.tensor.clip
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    py_clip(const dpctl::tensor::usm_ndarray &src,
+            const dpctl::tensor::usm_ndarray &min,
+            const dpctl::tensor::usm_ndarray &max,
+            const dpctl::tensor::usm_ndarray &dst,
+            sycl::queue &exec_q,
+            const std::vector<sycl::event> &depends);
+
+extern void init_clip_dispatch_vectors(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/repeat.cpp b/dpctl_ext/tensor/libtensor/source/repeat.cpp
new file mode 100644
index 000000000000..919f51f9a4d1
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/repeat.cpp
@@ -0,0 +1,820 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "kernels/repeat.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "simplify_iteration_space.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::repeat::repeat_by_sequence_fn_ptr_t;
+static repeat_by_sequence_fn_ptr_t
+    repeat_by_sequence_dispatch_vector[td_ns::num_types];
+
+using dpctl::tensor::kernels::repeat::repeat_by_sequence_1d_fn_ptr_t;
+static repeat_by_sequence_1d_fn_ptr_t
+    repeat_by_sequence_1d_dispatch_vector[td_ns::num_types];
+
+using dpctl::tensor::kernels::repeat::repeat_by_scalar_fn_ptr_t;
+static repeat_by_scalar_fn_ptr_t
+    repeat_by_scalar_dispatch_vector[td_ns::num_types];
+
+using dpctl::tensor::kernels::repeat::repeat_by_scalar_1d_fn_ptr_t;
+static repeat_by_scalar_1d_fn_ptr_t
+    repeat_by_scalar_1d_dispatch_vector[td_ns::num_types];
+
+void init_repeat_dispatch_vectors(void)
+{
+    using dpctl::tensor::kernels::repeat::RepeatSequenceFactory;
+    td_ns::DispatchVectorBuilder<repeat_by_sequence_fn_ptr_t,
+                                 RepeatSequenceFactory, td_ns::num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(repeat_by_sequence_dispatch_vector);
+
+    using dpctl::tensor::kernels::repeat::RepeatSequence1DFactory;
+    td_ns::DispatchVectorBuilder<repeat_by_sequence_1d_fn_ptr_t,
+                                 RepeatSequence1DFactory, td_ns::num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(repeat_by_sequence_1d_dispatch_vector);
+
+    using dpctl::tensor::kernels::repeat::RepeatScalarFactory;
+    td_ns::DispatchVectorBuilder<repeat_by_scalar_fn_ptr_t, RepeatScalarFactory,
+                                 td_ns::num_types>
+        dvb3;
+    dvb3.populate_dispatch_vector(repeat_by_scalar_dispatch_vector);
+
+    using dpctl::tensor::kernels::repeat::RepeatScalar1DFactory;
+    td_ns::DispatchVectorBuilder<repeat_by_scalar_1d_fn_ptr_t,
+                                 RepeatScalar1DFactory, td_ns::num_types>
+        dvb4;
+    dvb4.populate_dispatch_vector(repeat_by_scalar_1d_dispatch_vector);
+}
+
+std::pair<sycl::event, sycl::event>
+    py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src,
+                          const dpctl::tensor::usm_ndarray &dst,
+                          const dpctl::tensor::usm_ndarray &reps,
+                          const dpctl::tensor::usm_ndarray &cumsum,
+                          int axis,
+                          sycl::queue &exec_q,
+                          const std::vector<sycl::event> &depends)
+{
+    int src_nd = src.get_ndim();
+    if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) ||
+        (axis > 0 && src_nd == 0)) {
+        throw py::value_error("Specified axis is invalid.");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) {
+        throw py::value_error("Number of dimensions of source and destination "
+                              "arrays is not consistent");
+    }
+
+    int reps_nd = reps.get_ndim();
+    if (reps_nd != 1) {
+        throw py::value_error("`reps` array must be 1-dimensional");
+    }
+
+    if (cumsum.get_ndim() != 1) {
+        throw py::value_error("`cumsum` array must be 1-dimensional.");
+    }
+
+    if (!cumsum.is_c_contiguous()) {
+        throw py::value_error("Expecting `cumsum` array to be C-contiguous.");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst}))
+    {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    std::size_t reps_sz = reps.get_size();
+    std::size_t cumsum_sz = cumsum.get_size();
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool same_orthog_dims(true);
+    std::size_t orthog_nelems(1); // number of orthogonal iterations
+    for (auto i = 0; i < axis; ++i) {
+        auto src_sh_i = src_shape[i];
+        orthog_nelems *= src_sh_i;
+        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
+    }
+    for (auto i = axis + 1; i < src_nd; ++i) {
+        auto src_sh_i = src_shape[i];
+        orthog_nelems *= src_sh_i;
+        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
+    }
+
+    std::size_t src_axis_nelems(1);
+    if (src_nd > 0) {
+        src_axis_nelems = src_shape[axis];
+    }
+    std::size_t dst_axis_nelems(dst_shape[axis]);
+
+    // shape at repeated axis must be equal to the sum of reps
+    if (!same_orthog_dims || src_axis_nelems != reps_sz ||
+        src_axis_nelems != cumsum_sz)
+    {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    if (orthog_nelems == 0 || src_axis_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, orthog_nelems * dst_axis_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with src or reps
+    if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) {
+        throw py::value_error("Destination array overlaps with inputs");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+    int reps_typenum = reps.get_typenum();
+    int cumsum_typenum = cumsum.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+    int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum);
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array must have the same elemental data type");
+    }
+
+    static constexpr int int64_typeid =
+        static_cast<int>(td_ns::typenum_t::INT64);
+    if (cumsum_typeid != int64_typeid) {
+        throw py::value_error(
+            "Unexpected data type of `cumsum` array, expecting "
+            "'int64'");
+    }
+
+    if (reps_typeid != cumsum_typeid) {
+        throw py::value_error("`reps` array must have the same elemental "
+                              "data type as cumsum");
+    }
+
+    const char *src_data_p = src.get_data();
+    const char *reps_data_p = reps.get_data();
+    const char *cumsum_data_p = cumsum.get_data();
+    char *dst_data_p = dst.get_data();
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+
+    auto dst_shape_vec = dst.get_shape_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    auto reps_shape_vec = reps.get_shape_vector();
+    auto reps_strides_vec = reps.get_strides_vector();
+
+    sycl::event repeat_ev;
+    std::vector<sycl::event> host_task_events{};
+    if (axis == 0 && src_nd < 2) {
+        // empty orthogonal directions
+
+        auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid];
+
+        assert(dst_shape_vec.size() == 1);
+        assert(dst_strides_vec.size() == 1);
+
+        if (src_nd == 0) {
+            src_shape_vec = {0};
+            src_strides_vec = {0};
+        }
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, src_shape_vec, src_strides_vec);
+        auto packed_src_shape_strides_owner =
+            std::move(std::get<0>(ptr_size_event_tuple1));
+        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
+        const py::ssize_t *packed_src_shape_strides =
+            packed_src_shape_strides_owner.get();
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_shapes_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        repeat_ev =
+            fn(exec_q, src_axis_nelems, src_data_p, dst_data_p, reps_data_p,
+               cumsum_data_p, src_nd, packed_src_shape_strides,
+               dst_shape_vec[0], dst_strides_vec[0], reps_shape_vec[0],
+               reps_strides_vec[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {repeat_ev}, packed_src_shape_strides_owner);
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+    else {
+        // non-empty orthogonal directions
+
+        auto fn = repeat_by_sequence_dispatch_vector[src_typeid];
+
+        int orthog_nd = src_nd - 1;
+
+        using shT = std::vector<py::ssize_t>;
+        shT orthog_src_shape;
+        shT orthog_src_strides;
+        shT axis_src_shape;
+        shT axis_src_stride;
+        split_iteration_space(src_shape_vec, src_strides_vec, axis, axis + 1,
+                              orthog_src_shape, axis_src_shape,
+                              orthog_src_strides, axis_src_stride);
+
+        shT orthog_dst_shape;
+        shT orthog_dst_strides;
+        shT axis_dst_shape;
+        shT axis_dst_stride;
+        split_iteration_space(dst_shape_vec, dst_strides_vec, axis, axis + 1,
+                              orthog_dst_shape, axis_dst_shape,
+                              orthog_dst_strides, axis_dst_stride);
+
+        assert(orthog_src_shape.size() == static_cast<std::size_t>(orthog_nd));
+        assert(orthog_dst_shape.size() == static_cast<std::size_t>(orthog_nd));
+        assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(),
+                          orthog_dst_shape.begin()));
+
+        shT simplified_orthog_shape;
+        shT simplified_orthog_src_strides;
+        shT simplified_orthog_dst_strides;
+
+        const py::ssize_t *_shape = orthog_src_shape.data();
+
+        py::ssize_t orthog_src_offset(0);
+        py::ssize_t orthog_dst_offset(0);
+        simplify_iteration_space(
+            orthog_nd, _shape, orthog_src_strides, orthog_dst_strides,
+            // output
+            simplified_orthog_shape, simplified_orthog_src_strides,
+            simplified_orthog_dst_strides, orthog_src_offset,
+            orthog_dst_offset);
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, simplified_orthog_shape,
+            simplified_orthog_src_strides, simplified_orthog_dst_strides);
+        auto packed_shapes_strides_owner =
+            std::move(std::get<0>(ptr_size_event_tuple1));
+        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
+        const py::ssize_t *packed_shapes_strides =
+            packed_shapes_strides_owner.get();
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_shapes_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        repeat_ev = fn(exec_q, orthog_nelems, src_axis_nelems, src_data_p,
+                       dst_data_p, reps_data_p, cumsum_data_p,
+                       // data to build orthog indexer
+                       orthog_nd, packed_shapes_strides, orthog_src_offset,
+                       orthog_dst_offset,
+                       // data to build indexers along repeated axis in src
+                       axis_src_shape[0], axis_src_stride[0],
+                       // data to build indexer along repeated axis in dst
+                       axis_dst_shape[0], axis_dst_stride[0],
+                       // data to build indexer for reps array
+                       reps_shape_vec[0], reps_strides_vec[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {repeat_ev}, packed_shapes_strides_owner);
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+
+    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
+        exec_q, {src, reps, cumsum, dst}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, repeat_ev);
+}
+
+std::pair<sycl::event, sycl::event>
+    py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src,
+                          const dpctl::tensor::usm_ndarray &dst,
+                          const dpctl::tensor::usm_ndarray &reps,
+                          const dpctl::tensor::usm_ndarray &cumsum,
+                          sycl::queue &exec_q,
+                          const std::vector<sycl::event> &depends)
+{
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != 1) {
+        throw py::value_error(
+            "`dst` array must be 1-dimensional when repeating a full array");
+    }
+
+    int reps_nd = reps.get_ndim();
+    if (reps_nd != 1) {
+        throw py::value_error("`reps` array must be 1-dimensional");
+    }
+
+    if (cumsum.get_ndim() != 1) {
+        throw py::value_error("`cumsum` array must be 1-dimensional.");
+    }
+
+    if (!cumsum.is_c_contiguous()) {
+        throw py::value_error("Expecting `cumsum` array to be C-contiguous.");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst}))
+    {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    std::size_t src_sz = src.get_size();
+    std::size_t reps_sz = reps.get_size();
+    std::size_t cumsum_sz = cumsum.get_size();
+
+    // shape at repeated axis must be equal to the sum of reps
+    if (src_sz != reps_sz || src_sz != cumsum_sz) {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    if (src_sz == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst,
+                                                               dst.get_size());
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with src, cumsum, or reps
+    if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) {
+        throw py::value_error("Destination array overlaps with inputs");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+    int reps_typenum = reps.get_typenum();
+    int cumsum_typenum = cumsum.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+    int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum);
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array must have the same elemental data type");
+    }
+
+    static constexpr int int64_typeid =
+        static_cast<int>(td_ns::typenum_t::INT64);
+    if (cumsum_typeid != int64_typeid) {
+        throw py::value_error(
+            "Unexpected data type of `cumsum` array, expecting "
+            "'int64'");
+    }
+
+    if (reps_typeid != cumsum_typeid) {
+        throw py::value_error("`reps` array must have the same elemental "
+                              "data type as cumsum");
+    }
+
+    const char *src_data_p = src.get_data();
+    const char *reps_data_p = reps.get_data();
+    const char *cumsum_data_p = cumsum.get_data();
+    char *dst_data_p = dst.get_data();
+
+    int src_nd = src.get_ndim();
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+    if (src_nd == 0) {
+        src_shape_vec = {0};
+        src_strides_vec = {0};
+    }
+
+    auto dst_shape_vec = dst.get_shape_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    auto reps_shape_vec = reps.get_shape_vector();
+    auto reps_strides_vec = reps.get_strides_vector();
+
+    std::vector<sycl::event> host_task_events{};
+
+    auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid];
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, src_shape_vec, src_strides_vec);
+    auto packed_src_shapes_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple1));
+    sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
+    const py::ssize_t *packed_src_shapes_strides =
+        packed_src_shapes_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shapes_strides_ev);
+
+    assert(all_deps.size() == depends.size() + 1);
+
+    sycl::event repeat_ev = fn(
+        exec_q, src_sz, src_data_p, dst_data_p, reps_data_p, cumsum_data_p,
+        src_nd, packed_src_shapes_strides, dst_shape_vec[0], dst_strides_vec[0],
+        reps_shape_vec[0], reps_strides_vec[0], all_deps);
+
+    sycl::event cleanup_tmp_allocations_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {repeat_ev}, packed_src_shapes_strides_owner);
+    host_task_events.push_back(cleanup_tmp_allocations_ev);
+
+    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
+        exec_q, {src, reps, cumsum, dst}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, repeat_ev);
+}
+
+std::pair<sycl::event, sycl::event>
+    py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src,
+                        const dpctl::tensor::usm_ndarray &dst,
+                        const py::ssize_t reps,
+                        int axis,
+                        sycl::queue &exec_q,
+                        const std::vector<sycl::event> &depends)
+{
+    int src_nd = src.get_ndim();
+    if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) ||
+        (axis > 0 && src_nd == 0)) {
+        throw py::value_error("Specified axis is invalid.");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) {
+        throw py::value_error("Number of dimensions of source and destination "
+                              "arrays is not consistent");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool same_orthog_dims(true);
+    std::size_t orthog_nelems(1); // number of orthogonal iterations
+    for (auto i = 0; i < axis; ++i) {
+        auto src_sh_i = src_shape[i];
+        orthog_nelems *= src_sh_i;
+        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
+    }
+    for (auto i = axis + 1; i < src_nd; ++i) {
+        auto src_sh_i = src_shape[i];
+        orthog_nelems *= src_sh_i;
+        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
+    }
+
+    std::size_t src_axis_nelems(1);
+    if (src_nd > 0) {
+        src_axis_nelems = src_shape[axis];
+    }
+    std::size_t dst_axis_nelems(dst_shape[axis]);
+
+    // shape at repeated axis must be equal to the shape of src at the axis *
+    // reps
+    if (!same_orthog_dims || (src_axis_nelems * reps) != dst_axis_nelems) {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    if (orthog_nelems == 0 || src_axis_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, orthog_nelems * (src_axis_nelems * reps));
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with src
+    if (overlap(dst, src)) {
+        throw py::value_error("Destination array overlaps with inputs");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array must have the same elemental data type");
+    }
+
+    const char *src_data_p = src.get_data();
+    char *dst_data_p = dst.get_data();
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+
+    auto dst_shape_vec = dst.get_shape_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    sycl::event repeat_ev;
+    std::vector<sycl::event> host_task_events{};
+    if (axis == 0 && src_nd < 2) {
+        // empty orthogonal directions
+
+        auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid];
+
+        assert(dst_shape_vec.size() == 1);
+        assert(dst_strides_vec.size() == 1);
+
+        if (src_nd == 0) {
+            src_shape_vec = {0};
+            src_strides_vec = {0};
+        }
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, src_shape_vec, src_strides_vec);
+        auto packed_src_shape_strides_owner =
+            std::move(std::get<0>(ptr_size_event_tuple1));
+        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
+        const py::ssize_t *packed_src_shape_strides =
+            packed_src_shape_strides_owner.get();
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_shapes_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        repeat_ev = fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps,
+                       src_nd, packed_src_shape_strides, dst_shape_vec[0],
+                       dst_strides_vec[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {repeat_ev}, packed_src_shape_strides_owner);
+
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+    else {
+        // non-empty orthogonal directions
+
+        auto fn = repeat_by_scalar_dispatch_vector[src_typeid];
+
+        int orthog_nd = src_nd - 1;
+
+        using shT = std::vector<py::ssize_t>;
+        shT orthog_src_shape;
+        shT orthog_src_strides;
+        shT axis_src_shape;
+        shT axis_src_stride;
+        split_iteration_space(src_shape_vec, src_strides_vec, axis, axis + 1,
+                              orthog_src_shape, axis_src_shape,
+                              orthog_src_strides, axis_src_stride);
+
+        shT orthog_dst_shape;
+        shT orthog_dst_strides;
+        shT axis_dst_shape;
+        shT axis_dst_stride;
+        split_iteration_space(dst_shape_vec, dst_strides_vec, axis, axis + 1,
+                              orthog_dst_shape, axis_dst_shape,
+                              orthog_dst_strides, axis_dst_stride);
+
+        assert(orthog_src_shape.size() == static_cast<std::size_t>(orthog_nd));
+        assert(orthog_dst_shape.size() == static_cast<std::size_t>(orthog_nd));
+        assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(),
+                          orthog_dst_shape.begin()));
+
+        shT simplified_orthog_shape;
+        shT simplified_orthog_src_strides;
+        shT simplified_orthog_dst_strides;
+
+        const py::ssize_t *_shape = orthog_src_shape.data();
+
+        py::ssize_t orthog_src_offset(0);
+        py::ssize_t orthog_dst_offset(0);
+
+        simplify_iteration_space(
+            orthog_nd, _shape, orthog_src_strides, orthog_dst_strides,
+            // output
+            simplified_orthog_shape, simplified_orthog_src_strides,
+            simplified_orthog_dst_strides, orthog_src_offset,
+            orthog_dst_offset);
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, simplified_orthog_shape,
+            simplified_orthog_src_strides, simplified_orthog_dst_strides);
+        auto packed_shapes_strides_owner =
+            std::move(std::get<0>(ptr_size_event_tuple1));
+        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
+        const py::ssize_t *packed_shapes_strides =
+            packed_shapes_strides_owner.get();
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_shapes_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        repeat_ev = fn(exec_q, orthog_nelems, dst_axis_nelems, src_data_p,
+                       dst_data_p, reps,
+                       // data to build orthog indexer
+                       orthog_nd, packed_shapes_strides, orthog_src_offset,
+                       orthog_dst_offset,
+                       // data to build indexer along repeated axis in src
+                       axis_src_shape[0], axis_src_stride[0],
+                       // data to build indexer along repeated axis in dst
+                       axis_dst_shape[0], axis_dst_stride[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {repeat_ev}, packed_shapes_strides_owner);
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+
+    sycl::event py_obj_management_host_task_ev =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, repeat_ev);
+}
+
+std::pair<sycl::event, sycl::event>
+    py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src,
+                        const dpctl::tensor::usm_ndarray &dst,
+                        const py::ssize_t reps,
+                        sycl::queue &exec_q,
+                        const std::vector<sycl::event> &depends)
+{
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != 1) {
+        throw py::value_error(
+            "`dst` array must be 1-dimensional when repeating a full array");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    std::size_t src_sz = src.get_size();
+    std::size_t dst_sz = dst.get_size();
+
+    // shape at repeated axis must be equal to the shape of src at the axis *
+    // reps
+    if ((src_sz * reps) != dst_sz) {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    if (src_sz == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst,
+                                                               src_sz * reps);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with src
+    if (overlap(dst, src)) {
+        throw py::value_error("Destination array overlaps with inputs");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array must have the same elemental data type");
+    }
+
+    const char *src_data_p = src.get_data();
+    char *dst_data_p = dst.get_data();
+
+    int src_nd = src.get_ndim();
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+
+    if (src_nd == 0) {
+        src_shape_vec = {0};
+        src_strides_vec = {0};
+    }
+
+    auto dst_shape_vec = dst.get_shape_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    std::vector<sycl::event> host_task_events{};
+
+    auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid];
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, src_shape_vec, src_strides_vec);
+    auto packed_src_shape_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple1));
+    sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
+    const py::ssize_t *packed_src_shape_strides =
+        packed_src_shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shapes_strides_ev);
+
+    assert(all_deps.size() == depends.size() + 1);
+
+    sycl::event repeat_ev = fn(exec_q, dst_sz, src_data_p, dst_data_p, reps,
+                               src_nd, packed_src_shape_strides,
+                               dst_shape_vec[0], dst_strides_vec[0], all_deps);
+
+    sycl::event cleanup_tmp_allocations_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {repeat_ev}, packed_src_shape_strides_owner);
+    host_task_events.push_back(cleanup_tmp_allocations_ev);
+
+    sycl::event py_obj_management_host_task_ev =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, repeat_ev);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/repeat.hpp b/dpctl_ext/tensor/libtensor/source/repeat.hpp
new file mode 100644
index 000000000000..5835377fb29c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/repeat.hpp
@@ -0,0 +1,83 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_repeat_dispatch_vectors(void);
+
+extern std::pair<sycl::event, sycl::event>
+    py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src,
+                          const dpctl::tensor::usm_ndarray &dst,
+                          const dpctl::tensor::usm_ndarray &reps,
+                          const dpctl::tensor::usm_ndarray &cumsum,
+                          int axis,
+                          sycl::queue &exec_q,
+                          const std::vector<sycl::event> &depends);
+
+extern std::pair<sycl::event, sycl::event>
+    py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src,
+                          const dpctl::tensor::usm_ndarray &dst,
+                          const dpctl::tensor::usm_ndarray &reps,
+                          const dpctl::tensor::usm_ndarray &cumsum,
+                          sycl::queue &exec_q,
+                          const std::vector<sycl::event> &depends);
+
+extern std::pair<sycl::event, sycl::event>
+    py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src,
+                        const dpctl::tensor::usm_ndarray &dst,
+                        const py::ssize_t reps,
+                        int axis,
+                        sycl::queue &exec_q,
+                        const std::vector<sycl::event> &depends);
+
+extern std::pair<sycl::event, sycl::event>
+    py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src,
+                        const dpctl::tensor::usm_ndarray &dst,
+                        const py::ssize_t reps,
+                        sycl::queue &exec_q,
+                        const std::vector<sycl::event> &depends);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
index 7b151c773fe0..5e5b07c087f8 100644
--- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
+++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
@@ -45,7 +45,7 @@
 
 #include "accumulators.hpp"
 #include "boolean_advanced_indexing.hpp"
-// #include "clip.hpp"
+#include "clip.hpp"
 #include "copy_and_cast_usm_to_usm.hpp"
 #include "copy_as_contig.hpp"
 #include "copy_for_reshape.hpp"
@@ -57,12 +57,12 @@
 #include "integer_advanced_indexing.hpp"
 #include "kernels/dpctl_tensor_types.hpp"
 // #include "linear_sequences.hpp"
-// #include "repeat.hpp"
+#include "repeat.hpp"
 #include "simplify_iteration_space.hpp"
 #include "triul_ctor.hpp"
 #include "utils/memory_overlap.hpp"
 #include "utils/strided_iters.hpp"
-// #include "where.hpp"
+#include "where.hpp"
 #include "zeros_ctor.hpp"
 
 namespace py = pybind11;
@@ -116,8 +116,8 @@ using dpctl::tensor::py_internal::py_place;
 
 /* ================= Repeat ====================*/
 using dpctl::tensor::py_internal::py_cumsum_1d;
-// using dpctl::tensor::py_internal::py_repeat_by_scalar;
-// using dpctl::tensor::py_internal::py_repeat_by_sequence;
+using dpctl::tensor::py_internal::py_repeat_by_scalar;
+using dpctl::tensor::py_internal::py_repeat_by_sequence;
 
 /* ================ Eye ================== */
 
@@ -129,10 +129,10 @@ using dpctl::tensor::py_internal::usm_ndarray_triul;
 
 /* =========================== Where ============================== */
 
-// using dpctl::tensor::py_internal::py_where;
+using dpctl::tensor::py_internal::py_where;
 
 /* =========================== Clip ============================== */
-// using dpctl::tensor::py_internal::py_clip;
+using dpctl::tensor::py_internal::py_clip;
 
 // populate dispatch tables
 void init_dispatch_tables(void)
@@ -142,7 +142,7 @@ void init_dispatch_tables(void)
     init_copy_and_cast_usm_to_usm_dispatch_tables();
     init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables();
     init_advanced_indexing_dispatch_tables();
-    // init_where_dispatch_tables();
+    init_where_dispatch_tables();
     return;
 }
 
@@ -166,9 +166,9 @@ void init_dispatch_vectors(void)
     populate_mask_positions_dispatch_vectors();
 
     populate_cumsum_1d_dispatch_vectors();
-    // init_repeat_dispatch_vectors();
+    init_repeat_dispatch_vectors();
 
-    // init_clip_dispatch_vectors();
+    init_clip_dispatch_vectors();
 
     return;
 }
@@ -443,55 +443,53 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("mask_shape"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
-    // m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"),
-    //       py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"),
-    //       py::arg("depends") = py::list());
-
-    // auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src,
-    //                           const dpctl::tensor::usm_ndarray &dst,
-    //                           const dpctl::tensor::usm_ndarray &reps,
-    //                           const dpctl::tensor::usm_ndarray &cumsum,
-    //                           std::optional<int> axis, sycl::queue &exec_q,
-    //                           const std::vector<sycl::event> depends)
-    //     -> std::pair<sycl::event, sycl::event> {
-    //     if (axis) {
-    //         return py_repeat_by_sequence(src, dst, reps, cumsum,
-    //         axis.value(),
-    //                                      exec_q, depends);
-    //     }
-    //     else {
-    //         return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q,
-    //                                      depends);
-    //     }
-    // };
-    // m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"),
-    //       py::arg("dst"), py::arg("reps"), py::arg("cumsum"),
-    //       py::arg("axis"), py::arg("sycl_queue"), py::arg("depends") =
-    //       py::list());
-
-    // auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src,
-    //                         const dpctl::tensor::usm_ndarray &dst,
-    //                         const py::ssize_t reps, std::optional<int> axis,
-    //                         sycl::queue &exec_q,
-    //                         const std::vector<sycl::event> depends)
-    //     -> std::pair<sycl::event, sycl::event> {
-    //     if (axis) {
-    //         return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q,
-    //                                    depends);
-    //     }
-    //     else {
-    //         return py_repeat_by_scalar(src, dst, reps, exec_q, depends);
-    //     }
-    // };
-    // m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"),
-    //       py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"),
-    //       py::arg("depends") = py::list());
-
-    // m.def("_clip", &py_clip,
-    //       "Clamps elements of array `x` to the range "
-    //       "[`min`, `max] and writes the result to the "
-    //       "array `dst` for each element of `x`, `min`, and `max`."
-    //       "Returns a tuple of events: (hev, ev)",
-    //       py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"),
-    //       py::arg("sycl_queue"), py::arg("depends") = py::list());
+    m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"),
+          py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src,
+                              const dpctl::tensor::usm_ndarray &dst,
+                              const dpctl::tensor::usm_ndarray &reps,
+                              const dpctl::tensor::usm_ndarray &cumsum,
+                              std::optional<int> axis, sycl::queue &exec_q,
+                              const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        if (axis) {
+            return py_repeat_by_sequence(src, dst, reps, cumsum, axis.value(),
+                                         exec_q, depends);
+        }
+        else {
+            return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q,
+                                         depends);
+        }
+    };
+    m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"),
+          py::arg("dst"), py::arg("reps"), py::arg("cumsum"), py::arg("axis"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src,
+                            const dpctl::tensor::usm_ndarray &dst,
+                            const py::ssize_t reps, std::optional<int> axis,
+                            sycl::queue &exec_q,
+                            const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        if (axis) {
+            return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q,
+                                       depends);
+        }
+        else {
+            return py_repeat_by_scalar(src, dst, reps, exec_q, depends);
+        }
+    };
+    m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"),
+          py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_clip", &py_clip,
+          "Clamps elements of array `x` to the range "
+          "[`min`, `max] and writes the result to the "
+          "array `dst` for each element of `x`, `min`, and `max`."
+          "Returns a tuple of events: (hev, ev)",
+          py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
 }
diff --git a/dpctl_ext/tensor/libtensor/source/where.cpp b/dpctl_ext/tensor/libtensor/source/where.cpp
new file mode 100644
index 000000000000..46c52cf83b34
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/where.cpp
@@ -0,0 +1,265 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines Python API for implementation functions of
+/// dpctl.tensor.where
+//===---------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstddef>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "kernels/where.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "simplify_iteration_space.hpp"
+#include "where.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::search::where_contig_impl_fn_ptr_t;
+using dpctl::tensor::kernels::search::where_strided_impl_fn_ptr_t;
+
+static where_contig_impl_fn_ptr_t where_contig_dispatch_table[td_ns::num_types]
+                                                             [td_ns::num_types];
+static where_strided_impl_fn_ptr_t
+    where_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+using dpctl::utils::keep_args_alive;
+
+std::pair<sycl::event, sycl::event>
+    py_where(const dpctl::tensor::usm_ndarray &condition,
+             const dpctl::tensor::usm_ndarray &x1,
+             const dpctl::tensor::usm_ndarray &x2,
+             const dpctl::tensor::usm_ndarray &dst,
+             sycl::queue &exec_q,
+             const std::vector<sycl::event> &depends)
+{
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {x1, x2, condition, dst}))
+    {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    int nd = condition.get_ndim();
+    int x1_nd = x1.get_ndim();
+    int x2_nd = x2.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (nd != x1_nd || nd != x2_nd) {
+        throw py::value_error(
+            "Input arrays are not of appropriate dimension for where kernel.");
+    }
+
+    if (nd != dst_nd) {
+        throw py::value_error(
+            "Destination is not of appropriate dimension for where kernel.");
+    }
+
+    const py::ssize_t *x1_shape = x1.get_shape_raw();
+    const py::ssize_t *x2_shape = x2.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    const py::ssize_t *cond_shape = condition.get_shape_raw();
+
+    bool shapes_equal(true);
+    std::size_t nelems(1);
+    for (int i = 0; i < nd; ++i) {
+        const auto &sh_i = dst_shape[i];
+        nelems *= static_cast<std::size_t>(sh_i);
+        shapes_equal = shapes_equal && (x1_shape[i] == sh_i) &&
+                       (x2_shape[i] == sh_i) && (cond_shape[i] == sh_i);
+    }
+
+    if (!shapes_equal) {
+        throw py::value_error("Axes are not of matching shapes.");
+    }
+
+    if (nelems == 0) {
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(dst, condition) && !same_logical_tensors(dst, condition)) ||
+        (overlap(dst, x1) && !same_logical_tensors(dst, x1)) ||
+        (overlap(dst, x2) && !same_logical_tensors(dst, x2)))
+    {
+        throw py::value_error("Destination array overlaps with input.");
+    }
+
+    int x1_typenum = x1.get_typenum();
+    int x2_typenum = x2.get_typenum();
+    int cond_typenum = condition.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int cond_typeid = array_types.typenum_to_lookup_id(cond_typenum);
+    int x1_typeid = array_types.typenum_to_lookup_id(x1_typenum);
+    int x2_typeid = array_types.typenum_to_lookup_id(x2_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (x1_typeid != x2_typeid || x1_typeid != dst_typeid) {
+        throw py::value_error("Value arrays must have the same data type");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems);
+
+    char *cond_data = condition.get_data();
+    char *x1_data = x1.get_data();
+    char *x2_data = x2.get_data();
+    char *dst_data = dst.get_data();
+
+    bool is_x1_c_contig = x1.is_c_contiguous();
+    bool is_x1_f_contig = x1.is_f_contiguous();
+
+    bool is_x2_c_contig = x2.is_c_contiguous();
+    bool is_x2_f_contig = x2.is_f_contiguous();
+
+    bool is_cond_c_contig = condition.is_c_contiguous();
+    bool is_cond_f_contig = condition.is_f_contiguous();
+
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_dst_f_contig = dst.is_f_contiguous();
+
+    bool all_c_contig = (is_x1_c_contig && is_x2_c_contig && is_cond_c_contig &&
+                         is_dst_c_contig);
+    bool all_f_contig = (is_x1_f_contig && is_x2_f_contig && is_cond_f_contig &&
+                         is_dst_f_contig);
+
+    if (all_c_contig || all_f_contig) {
+        auto contig_fn = where_contig_dispatch_table[x1_typeid][cond_typeid];
+
+        auto where_ev = contig_fn(exec_q, nelems, cond_data, x1_data, x2_data,
+                                  dst_data, depends);
+        sycl::event ht_ev =
+            keep_args_alive(exec_q, {x1, x2, dst, condition}, {where_ev});
+
+        return std::make_pair(ht_ev, where_ev);
+    }
+
+    auto const &cond_strides = condition.get_strides_vector();
+    auto const &x1_strides = x1.get_strides_vector();
+    auto const &x2_strides = x2.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_cond_strides;
+    shT simplified_x1_strides;
+    shT simplified_x2_strides;
+    shT simplified_dst_strides;
+    py::ssize_t cond_offset(0);
+    py::ssize_t x1_offset(0);
+    py::ssize_t x2_offset(0);
+    py::ssize_t dst_offset(0);
+
+    simplify_iteration_space_4(
+        nd, x1_shape, cond_strides, x1_strides, x2_strides, dst_strides,
+        // outputs
+        simplified_shape, simplified_cond_strides, simplified_x1_strides,
+        simplified_x2_strides, simplified_dst_strides, cond_offset, x1_offset,
+        x2_offset, dst_offset);
+
+    auto fn = where_strided_dispatch_table[x1_typeid][cond_typeid];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events,
+        // common shape and strides
+        simplified_shape, simplified_cond_strides, simplified_x1_strides,
+        simplified_x2_strides, simplified_dst_strides);
+    auto packed_shape_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shape_strides_ev);
+
+    assert(all_deps.size() == depends.size() + 1);
+
+    sycl::event where_ev = fn(exec_q, nelems, nd, cond_data, x1_data, x2_data,
+                              dst_data, packed_shape_strides, cond_offset,
+                              x1_offset, x2_offset, dst_offset, all_deps);
+
+    // free packed temporaries
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {where_ev}, packed_shape_strides_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    sycl::event arg_cleanup_ev =
+        keep_args_alive(exec_q, {x1, x2, condition, dst}, host_task_events);
+
+    return std::make_pair(arg_cleanup_ev, where_ev);
+}
+
+void init_where_dispatch_tables(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::search::WhereContigFactory;
+    DispatchTableBuilder<where_contig_impl_fn_ptr_t, WhereContigFactory,
+                         num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(where_contig_dispatch_table);
+
+    using dpctl::tensor::kernels::search::WhereStridedFactory;
+    DispatchTableBuilder<where_strided_impl_fn_ptr_t, WhereStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(where_strided_dispatch_table);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/where.hpp b/dpctl_ext/tensor/libtensor/source/where.hpp
new file mode 100644
index 000000000000..ba81d8b11642
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/where.hpp
@@ -0,0 +1,57 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares Python API for implementation functions of
+/// dpctl.tensor.where
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    py_where(const dpctl::tensor::usm_ndarray &,
+             const dpctl::tensor::usm_ndarray &,
+             const dpctl::tensor::usm_ndarray &,
+             const dpctl::tensor::usm_ndarray &,
+             sycl::queue &,
+             const std::vector<sycl::event> &);
+
+extern void init_where_dispatch_tables(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py
index 47edf63a68b4..f3dd18153563 100644
--- a/dpnp/dpnp_algo/dpnp_arraycreation.py
+++ b/dpnp/dpnp_algo/dpnp_arraycreation.py
@@ -243,7 +243,7 @@ def dpnp_linspace(
             # Needed a special handling for denormal numbers (when step == 0),
             # see numpy#5437 for more details.
             # Note, dpt.where() is used to avoid a synchronization branch.
-            usm_res = dpt.where(
+            usm_res = dpt_ext.where(
                 step == 0, (usm_res / step_num) * delta, usm_res * step
             )
         else:
diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
index 55d74e8c1803..722b8cb3b3f0 100644
--- a/dpnp/dpnp_algo/dpnp_elementwise_common.py
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -30,7 +30,6 @@
 from functools import wraps
 
 import dpctl.tensor as dpt
-import dpctl.tensor._copy_utils as dtc
 import dpctl.tensor._type_utils as dtu
 import dpctl.utils as dpu
 import numpy
@@ -48,6 +47,7 @@
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._copy_utils as dtc
 import dpctl_ext.tensor._tensor_impl as dti
 import dpnp
 import dpnp.backend.extensions.vm._vm_impl as vmi
diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py
index 0b6d882c53db..b3ed3770396d 100644
--- a/dpnp/dpnp_array.py
+++ b/dpnp/dpnp_array.py
@@ -38,13 +38,13 @@
 import warnings
 
 import dpctl.tensor as dpt
-import dpctl.tensor._type_utils as dtu
-from dpctl.tensor._numpy_helper import AxisError
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._type_utils as dtu
 import dpnp
+from dpctl_ext.tensor._numpy_helper import AxisError
 
 from . import memory as dpm
 
diff --git a/dpnp/dpnp_iface_functional.py b/dpnp/dpnp_iface_functional.py
index 1985eced2e71..797d8a736276 100644
--- a/dpnp/dpnp_iface_functional.py
+++ b/dpnp/dpnp_iface_functional.py
@@ -41,13 +41,15 @@
 
 # pylint: disable=protected-access
 
-from dpctl.tensor._numpy_helper import (
+import dpnp
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import (
     normalize_axis_index,
     normalize_axis_tuple,
 )
 
-import dpnp
-
 # pylint: disable=no-name-in-module
 from dpnp.dpnp_utils import get_usm_allocations
 
diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index f305b106221f..bc190db70c4e 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -250,7 +250,7 @@ def choose(a, choices, out=None, mode="wrap"):
 
     res_usm_type, exec_q = get_usm_allocations(choices + [inds])
     # apply type promotion to input choices
-    res_dt = dpt.result_type(*choices)
+    res_dt = dpt_ext.result_type(*choices)
     if len(choices) > 1:
         choices = tuple(
             map(
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index e988bbaa237b..d5e1e1aa5706 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -47,16 +47,16 @@
 import dpctl
 import dpctl.tensor as dpt
 import numpy
-from dpctl.tensor._numpy_helper import (
-    AxisError,
-    normalize_axis_index,
-    normalize_axis_tuple,
-)
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt_ext
 import dpnp
+from dpctl_ext.tensor._numpy_helper import (
+    AxisError,
+    normalize_axis_index,
+    normalize_axis_tuple,
+)
 
 from .dpnp_array import dpnp_array
 
@@ -1270,7 +1270,7 @@ def can_cast(from_, to, casting="safe"):
         if dpnp.is_supported_array_type(from_)
         else dpnp.dtype(from_)
     )
-    return dpt.can_cast(dtype_from, to, casting=casting)
+    return dpt_ext.can_cast(dtype_from, to, casting=casting)
 
 
 def column_stack(tup):
@@ -2837,7 +2837,7 @@ def repeat(a, repeats, axis=None):
         a = dpnp.ravel(a)
 
     usm_arr = dpnp.get_usm_ndarray(a)
-    usm_res = dpt.repeat(usm_arr, repeats, axis=axis)
+    usm_res = dpt_ext.repeat(usm_arr, repeats, axis=axis)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -3195,7 +3195,7 @@ def result_type(*arrays_and_dtypes):
         )
         for X in arrays_and_dtypes
     ]
-    return dpt.result_type(*usm_arrays_and_dtypes)
+    return dpt_ext.result_type(*usm_arrays_and_dtypes)
 
 
 def roll(x, shift, axis=None):
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index e339c24d384c..06f4fe936253 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -40,6 +40,7 @@
 """
 
 # pylint: disable=protected-access
+# pylint: disable=duplicate-code
 # pylint: disable=no-name-in-module
 
 
@@ -48,17 +49,19 @@
 
 import dpctl.tensor as dpt
 import dpctl.tensor._tensor_elementwise_impl as ti
-import dpctl.tensor._type_utils as dtu
 import dpctl.utils as dpu
 import numpy
-from dpctl.tensor._numpy_helper import (
-    normalize_axis_index,
-    normalize_axis_tuple,
-)
-from dpctl.tensor._type_utils import _acceptance_fn_divide
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
+from dpctl_ext.tensor._numpy_helper import (
+    normalize_axis_index,
+    normalize_axis_tuple,
+)
 
 from .dpnp_algo.dpnp_elementwise_common import (
     DPNPI0,
@@ -727,7 +730,7 @@ def clip(a, /, min=None, max=None, *, out=None, order="K", **kwargs):
     usm_max = None if max is None else dpnp.get_usm_ndarray_or_scalar(max)
 
     usm_out = None if out is None else dpnp.get_usm_ndarray(out)
-    usm_res = dpt.clip(usm_arr, usm_min, usm_max, out=usm_out, order=order)
+    usm_res = dpt_ext.clip(usm_arr, usm_min, usm_max, out=usm_out, order=order)
     if out is not None and isinstance(out, dpnp_array):
         return out
     return dpnp_array._create_from_usm_ndarray(usm_res)
@@ -1561,7 +1564,7 @@ def diff(a, n=1, axis=-1, prepend=None, append=None):
     mkl_fn_to_call="_mkl_div_to_call",
     mkl_impl_fn="_div",
     binary_inplace_fn=ti._divide_inplace,
-    acceptance_fn=_acceptance_fn_divide,
+    acceptance_fn=dtu._acceptance_fn_divide,
 )
 
 
diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py
index 16ab633d506b..a2389978d506 100644
--- a/dpnp/dpnp_iface_searching.py
+++ b/dpnp/dpnp_iface_searching.py
@@ -44,6 +44,7 @@
 # pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._tensor_impl as dti
 import dpnp
 
@@ -473,5 +474,7 @@ def where(condition, x=None, y=None, /, *, order="K", out=None):
     usm_condition = dpnp.get_usm_ndarray(condition)
 
     usm_out = None if out is None else dpnp.get_usm_ndarray(out)
-    usm_res = dpt.where(usm_condition, usm_x, usm_y, order=order, out=usm_out)
+    usm_res = dpt_ext.where(
+        usm_condition, usm_x, usm_y, order=order, out=usm_out
+    )
     return dpnp.get_result_array(usm_res, out)
diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py
index be6c52ae9d80..5f7a3829b3c9 100644
--- a/dpnp/dpnp_iface_sorting.py
+++ b/dpnp/dpnp_iface_sorting.py
@@ -42,12 +42,12 @@
 from collections.abc import Sequence
 
 import dpctl.tensor as dpt
-from dpctl.tensor._numpy_helper import normalize_axis_index
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt_ext
 import dpnp
+from dpctl_ext.tensor._numpy_helper import normalize_axis_index
 
 # pylint: disable=no-name-in-module
 from .dpnp_algo import (
diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py
index daff981d5cc4..9d3ccc40ecf5 100644
--- a/dpnp/dpnp_iface_statistics.py
+++ b/dpnp/dpnp_iface_statistics.py
@@ -45,7 +45,6 @@
 import dpctl.tensor._tensor_elementwise_impl as ti
 import dpctl.utils as dpu
 import numpy
-from dpctl.tensor._numpy_helper import normalize_axis_index
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
@@ -54,6 +53,7 @@
 
 # pylint: disable=no-name-in-module
 import dpnp.backend.extensions.statistics._statistics_impl as statistics_ext
+from dpctl_ext.tensor._numpy_helper import normalize_axis_index
 from dpnp.dpnp_utils.dpnp_utils_common import (
     result_type_for_device,
     to_supported_dtypes,
diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py
index a46f06c10e08..9894bd304701 100644
--- a/dpnp/dpnp_iface_trigonometric.py
+++ b/dpnp/dpnp_iface_trigonometric.py
@@ -45,8 +45,10 @@
 
 import dpctl.tensor as dpt
 import dpctl.tensor._tensor_elementwise_impl as ti
-import dpctl.tensor._type_utils as dtu
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
 
diff --git a/dpnp/dpnp_iface_types.py b/dpnp/dpnp_iface_types.py
index 8fdb9e1d3d38..f133333d6b83 100644
--- a/dpnp/dpnp_iface_types.py
+++ b/dpnp/dpnp_iface_types.py
@@ -40,6 +40,9 @@
 import dpctl.tensor as dpt
 import numpy
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 
 from .dpnp_array import dpnp_array
@@ -211,7 +214,7 @@ def finfo(dtype):
     """
     if isinstance(dtype, dpnp_array):
         dtype = dtype.dtype
-    return dpt.finfo(dtype)
+    return dpt_ext.finfo(dtype)
 
 
 # pylint: disable=redefined-outer-name
@@ -244,7 +247,7 @@ def iinfo(dtype):
 
     if isinstance(dtype, dpnp_array):
         dtype = dtype.dtype
-    return dpt.iinfo(dtype)
+    return dpt_ext.iinfo(dtype)
 
 
 def isdtype(dtype, kind):
@@ -298,7 +301,7 @@ def isdtype(dtype, kind):
     elif isinstance(kind, tuple):
         kind = tuple(dpt.dtype(k) if isinstance(k, type) else k for k in kind)
 
-    return dpt.isdtype(dtype, kind)
+    return dpt_ext.isdtype(dtype, kind)
 
 
 def issubdtype(arg1, arg2):
diff --git a/dpnp/dpnp_utils/dpnp_utils_common.py b/dpnp/dpnp_utils/dpnp_utils_common.py
index e4bde2e1ec86..aa294fefe275 100644
--- a/dpnp/dpnp_utils/dpnp_utils_common.py
+++ b/dpnp/dpnp_utils/dpnp_utils_common.py
@@ -29,8 +29,9 @@
 
 from collections.abc import Iterable
 
-import dpctl.tensor._type_utils as dtu
-
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 from dpnp.dpnp_utils import map_dtype_to_device
 
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index 9ad97742ee18..b01f57eaecdd 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -30,11 +30,6 @@
 import dpctl.tensor as dpt
 import dpctl.utils as dpu
 import numpy
-from dpctl.tensor._numpy_helper import (
-    AxisError,
-    normalize_axis_index,
-    normalize_axis_tuple,
-)
 from dpctl.utils import ExecutionPlacementError
 
 # pylint: disable=no-name-in-module
@@ -43,6 +38,11 @@
 import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 import dpnp.backend.extensions.blas._blas_impl as bi
+from dpctl_ext.tensor._numpy_helper import (
+    AxisError,
+    normalize_axis_index,
+    normalize_axis_tuple,
+)
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations
 
diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py
index 3a3bc04a31af..ec67b619a13f 100644
--- a/dpnp/dpnp_utils/dpnp_utils_statistics.py
+++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py
@@ -30,10 +30,13 @@
 
 import dpctl
 import dpctl.tensor as dpt
-from dpctl.tensor._numpy_helper import normalize_axis_tuple
 from dpctl.utils import ExecutionPlacementError
 
 import dpnp
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import normalize_axis_tuple
 from dpnp.dpnp_array import dpnp_array
 
 __all__ = ["dpnp_cov", "dpnp_median"]
diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py
index 534b9404254f..b959b78e1ad0 100644
--- a/dpnp/fft/dpnp_utils_fft.py
+++ b/dpnp/fft/dpnp_utils_fft.py
@@ -49,10 +49,6 @@
 import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dpu
 import numpy
-from dpctl.tensor._numpy_helper import (
-    normalize_axis_index,
-    normalize_axis_tuple,
-)
 from dpctl.utils import ExecutionPlacementError
 
 # pylint: disable=no-name-in-module
@@ -61,6 +57,10 @@
 import dpctl_ext.tensor._tensor_impl as ti_ext
 import dpnp
 import dpnp.backend.extensions.fft._fft_impl as fi
+from dpctl_ext.tensor._numpy_helper import (
+    normalize_axis_index,
+    normalize_axis_tuple,
+)
 
 from ..dpnp_array import dpnp_array
 from ..dpnp_utils import map_dtype_to_device
diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py
index 6959565ecf17..f4e0f96da5e6 100644
--- a/dpnp/linalg/dpnp_iface_linalg.py
+++ b/dpnp/linalg/dpnp_iface_linalg.py
@@ -45,9 +45,12 @@
 from typing import NamedTuple
 
 import numpy
-from dpctl.tensor._numpy_helper import normalize_axis_tuple
 
 import dpnp
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import normalize_axis_tuple
 from dpnp.backend.extensions.lapack._lapack_impl import LinAlgError
 
 from .dpnp_utils_linalg import (
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index c6897e7b0614..28e11f6188c5 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -44,7 +44,6 @@
 
 import dpctl.utils as dpu
 import numpy
-from dpctl.tensor._numpy_helper import normalize_axis_index
 from numpy import prod
 
 # pylint: disable=no-name-in-module
@@ -53,6 +52,7 @@
 import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 import dpnp.backend.extensions.lapack._lapack_impl as li
+from dpctl_ext.tensor._numpy_helper import normalize_axis_index
 from dpnp.dpnp_utils import get_usm_allocations
 
 
diff --git a/dpnp/tests/test_arraymanipulation.py b/dpnp/tests/test_arraymanipulation.py
index ba83ee94d8b0..17d2c07fd6cc 100644
--- a/dpnp/tests/test_arraymanipulation.py
+++ b/dpnp/tests/test_arraymanipulation.py
@@ -3,11 +3,14 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from numpy.testing import assert_array_equal, assert_equal, assert_raises
 
 import dpnp
 
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
+
 from .helper import get_all_dtypes, get_float_complex_dtypes
 from .third_party.cupy import testing
 
diff --git a/dpnp/tests/test_counting.py b/dpnp/tests/test_counting.py
index 762abd58b687..9210e7c1b3dd 100644
--- a/dpnp/tests/test_counting.py
+++ b/dpnp/tests/test_counting.py
@@ -1,6 +1,5 @@
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from numpy.testing import (
     assert_allclose,
     assert_equal,
@@ -9,6 +8,10 @@
 
 import dpnp
 
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
+
 from .helper import (
     get_all_dtypes,
     get_float_dtypes,
diff --git a/dpnp/tests/test_flipping.py b/dpnp/tests/test_flipping.py
index cc84242f4557..cd55846e3668 100644
--- a/dpnp/tests/test_flipping.py
+++ b/dpnp/tests/test_flipping.py
@@ -2,13 +2,16 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from numpy.testing import (
     assert_equal,
 )
 
 import dpnp
 
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
+
 from .helper import (
     get_all_dtypes,
 )
diff --git a/dpnp/tests/test_indexing.py b/dpnp/tests/test_indexing.py
index 9a55efe138b7..79c41a2f45f7 100644
--- a/dpnp/tests/test_indexing.py
+++ b/dpnp/tests/test_indexing.py
@@ -4,8 +4,6 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
-from dpctl.tensor._type_utils import _to_device_supported_dtype
 from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_,
@@ -16,6 +14,11 @@
 )
 
 import dpnp
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
+from dpctl_ext.tensor._type_utils import _to_device_supported_dtype
 from dpnp.dpnp_array import dpnp_array
 
 from .helper import (
diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 31d99d71ce49..e1ad9af7d220 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -4,7 +4,6 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_allclose,
@@ -16,6 +15,10 @@
 
 import dpnp
 
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
+
 from .helper import (
     assert_dtype_allclose,
     generate_random_numpy_array,
diff --git a/dpnp/tests/test_manipulation.py b/dpnp/tests/test_manipulation.py
index 8ddba08dbb92..2512c0955da7 100644
--- a/dpnp/tests/test_manipulation.py
+++ b/dpnp/tests/test_manipulation.py
@@ -3,7 +3,6 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from numpy.testing import (
     assert_array_equal,
     assert_equal,
@@ -12,6 +11,10 @@
 
 import dpnp
 
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
+
 from .helper import (
     assert_dtype_allclose,
     get_all_dtypes,
diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py
index 760c1a0ceb2e..ef8f6731ffd2 100644
--- a/dpnp/tests/test_mathematical.py
+++ b/dpnp/tests/test_mathematical.py
@@ -2,10 +2,6 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import (
-    AxisError,
-    normalize_axis_index,
-)
 from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_allclose,
@@ -16,6 +12,13 @@
 )
 
 import dpnp
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import (
+    AxisError,
+    normalize_axis_index,
+)
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import map_dtype_to_device
 
diff --git a/dpnp/tests/test_product.py b/dpnp/tests/test_product.py
index afe767a5e5d9..9c2bc54e30b5 100644
--- a/dpnp/tests/test_product.py
+++ b/dpnp/tests/test_product.py
@@ -1,11 +1,14 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_allclose, assert_array_equal, assert_raises
 
 import dpnp
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
 from dpnp.dpnp_utils import map_dtype_to_device
 
 from .helper import (
diff --git a/dpnp/tests/test_sort.py b/dpnp/tests/test_sort.py
index 5e883c575f85..73eac4064892 100644
--- a/dpnp/tests/test_sort.py
+++ b/dpnp/tests/test_sort.py
@@ -1,10 +1,13 @@
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from numpy.testing import assert_array_equal, assert_equal, assert_raises
 
 import dpnp
 
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
+
 from .helper import (
     assert_dtype_allclose,
     generate_random_numpy_array,
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py b/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
index 95d753c90473..085261317ead 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
@@ -6,13 +6,16 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 # from cupy_backends.cuda.api import driver
 # from cupy_backends.cuda.api import runtime
 # from cupy_backends.cuda import stream as stream_module
 import dpnp as cupy
 
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
+
 # from cupy import _util
 # from cupy import _core
 # from cupy import cuda
diff --git a/dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py b/dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py
index c241824fa81d..a1309f3ed83d 100644
--- a/dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py
+++ b/dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py
@@ -2,9 +2,12 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
 from dpnp.tests.helper import has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
 
diff --git a/dpnp/tests/third_party/cupy/manipulation_tests/test_dims.py b/dpnp/tests/third_party/cupy/manipulation_tests/test_dims.py
index 7355d07e1d9b..8944a6b944c9 100644
--- a/dpnp/tests/third_party/cupy/manipulation_tests/test_dims.py
+++ b/dpnp/tests/third_party/cupy/manipulation_tests/test_dims.py
@@ -2,9 +2,12 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
 from dpnp.tests.third_party.cupy import testing
 
 
diff --git a/dpnp/tests/third_party/cupy/manipulation_tests/test_transpose.py b/dpnp/tests/third_party/cupy/manipulation_tests/test_transpose.py
index 7e7a62dce52a..0f6bed1c2ced 100644
--- a/dpnp/tests/third_party/cupy/manipulation_tests/test_transpose.py
+++ b/dpnp/tests/third_party/cupy/manipulation_tests/test_transpose.py
@@ -2,9 +2,12 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
 from dpnp.tests.third_party.cupy import testing
 
 
diff --git a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
index b8f98456a13a..cb7200c1b13b 100644
--- a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
+++ b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
@@ -2,9 +2,12 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
 from dpnp.tests.helper import (
     has_support_aspect16,
     has_support_aspect64,
diff --git a/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py b/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
index 7e0eade13254..8359ba580a25 100644
--- a/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
+++ b/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
@@ -4,9 +4,12 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
 from dpnp.tests.helper import has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
 
diff --git a/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py b/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py
index bf5d37df2fba..d355d18985f2 100644
--- a/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py
+++ b/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py
@@ -2,9 +2,12 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
 from dpnp.tests.helper import has_support_aspect16, has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
 
diff --git a/dpnp/tests/third_party/cupy/testing/_loops.py b/dpnp/tests/third_party/cupy/testing/_loops.py
index 63cd09147c4b..66c243a3d7f7 100644
--- a/dpnp/tests/third_party/cupy/testing/_loops.py
+++ b/dpnp/tests/third_party/cupy/testing/_loops.py
@@ -10,9 +10,12 @@
 import numpy
 import pytest
 from dpctl import select_default_device
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._numpy_helper import AxisError
 from dpnp.tests import config
 from dpnp.tests.third_party.cupy.testing import _array, _parameterized
 from dpnp.tests.third_party.cupy.testing._pytest_impl import is_available

From 585f2e5fc836841ae6a534621ee60e9b9790b73f Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 5 Mar 2026 15:44:55 +0100
Subject: [PATCH 08/43] Extend `._tensor_impl` with linear sequences functions 
 (#2782)

This PR is the final one in the series of extending `_tensor_impl`
extension

It extends `_tensor_impl` in `dpctl_ext.tensor` with linear sequence
functions
(`_linspace_step and _linspace_affine`)

Also this PR significantly expands Python API of `dpctl_ext.tensor` by
adding all missing functions from `dpctl_ext.tensor._ctors` and
`dpctl_ext.tensor._manipulation_functions`

`_tensor_impl`:  45 / 45 functions
Python API dpctl_ext.tensor:  70 / 233 functions
---
 dpctl_ext/tensor/CMakeLists.txt               |    4 +-
 dpctl_ext/tensor/__init__.py                  |   46 +
 dpctl_ext/tensor/_clip.py                     |   74 +-
 dpctl_ext/tensor/_copy_utils.py               |   58 +-
 dpctl_ext/tensor/_ctors.py                    | 1776 +++++++++++++++--
 dpctl_ext/tensor/_indexing_functions.py       |   14 +-
 dpctl_ext/tensor/_manipulation_functions.py   |  727 ++++++-
 dpctl_ext/tensor/_reshape.py                  |    8 +-
 dpctl_ext/tensor/_scalar_utils.py             |    6 +-
 dpctl_ext/tensor/_search_functions.py         |   24 +-
 .../include/kernels/constructors.hpp          |  178 ++
 .../libtensor/source/linear_sequences.cpp     |  306 +++
 .../libtensor/source/linear_sequences.hpp     |   66 +
 .../tensor/libtensor/source/tensor_ctors.cpp  |   38 +-
 dpnp/dpnp_algo/dpnp_arraycreation.py          |   14 +-
 dpnp/dpnp_algo/dpnp_elementwise_common.py     |   24 +-
 dpnp/dpnp_algo/dpnp_fill.py                   |    7 +-
 dpnp/dpnp_array.py                            |    2 +-
 dpnp/dpnp_container.py                        |   13 +-
 dpnp/dpnp_iface.py                            |    2 +-
 dpnp/dpnp_iface_arraycreation.py              |    8 +-
 dpnp/dpnp_iface_indexing.py                   |   16 +-
 dpnp/dpnp_iface_manipulation.py               |   28 +-
 dpnp/dpnp_iface_searching.py                  |    2 +-
 dpnp/fft/dpnp_utils_fft.py                    |   12 +-
 dpnp/tests/test_arraycreation.py              |    5 +-
 dpnp/tests/test_arraymanipulation.py          |    6 +-
 dpnp/tests/test_fft.py                        |    4 +-
 dpnp/tests/test_indexing.py                   |    5 +-
 dpnp/tests/test_linalg.py                     |    4 +-
 dpnp/tests/test_manipulation.py               |    4 +-
 dpnp/tests/test_mathematical.py               |   13 +-
 dpnp/tests/test_memory.py                     |    5 +-
 dpnp/tests/test_nanfunctions.py               |    4 +-
 dpnp/tests/test_ndarray.py                    |    5 +-
 dpnp/tests/test_search.py                     |    4 +-
 dpnp/tests/test_statistics.py                 |    4 +-
 dpnp/tests/test_sycl_queue.py                 |    4 +-
 dpnp/tests/test_usm_type.py                   |    4 +-
 dpnp/tests/test_utils.py                      |    4 +-
 40 files changed, 3194 insertions(+), 334 deletions(-)
 create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index 6f823a818ce7..864e34ddaba4 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -51,7 +51,7 @@ set(_tensor_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp
@@ -93,7 +93,7 @@ endif()
 set(_no_fast_math_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
-    # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
 )
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index 2c1e761beb3b..9d4013e146a7 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -36,10 +36,21 @@
     to_numpy,
 )
 from ._ctors import (
+    arange,
+    asarray,
+    empty,
+    empty_like,
     eye,
     full,
+    full_like,
+    linspace,
+    meshgrid,
+    ones,
+    ones_like,
     tril,
     triu,
+    zeros,
+    zeros_like,
 )
 from ._indexing_functions import (
     extract,
@@ -51,27 +62,55 @@
     take_along_axis,
 )
 from ._manipulation_functions import (
+    broadcast_arrays,
+    broadcast_to,
+    concat,
+    expand_dims,
+    flip,
+    moveaxis,
+    permute_dims,
     repeat,
     roll,
+    squeeze,
+    stack,
+    swapaxes,
+    tile,
+    unstack,
 )
 from ._reshape import reshape
 from ._search_functions import where
 from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type
 
 __all__ = [
+    "arange",
+    "asarray",
     "asnumpy",
     "astype",
+    "broadcast_arrays",
+    "broadcast_to",
     "can_cast",
+    "concat",
     "copy",
     "clip",
+    "empty",
+    "empty_like",
     "extract",
+    "expand_dims",
     "eye",
     "finfo",
+    "flip",
     "from_numpy",
     "full",
+    "full_like",
     "iinfo",
     "isdtype",
+    "linspace",
+    "meshgrid",
+    "moveaxis",
+    "permute_dims",
     "nonzero",
+    "ones",
+    "ones_like",
     "place",
     "put",
     "put_along_axis",
@@ -79,10 +118,17 @@
     "reshape",
     "result_type",
     "roll",
+    "squeeze",
+    "stack",
+    "swapaxes",
     "take",
     "take_along_axis",
+    "tile",
     "to_numpy",
     "tril",
     "triu",
+    "unstack",
     "where",
+    "zeros",
+    "zeros_like",
 ]
diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py
index f145e9f2d98d..ef07269c4ea0 100644
--- a/dpctl_ext/tensor/_clip.py
+++ b/dpctl_ext/tensor/_clip.py
@@ -163,7 +163,7 @@ def _clip_none(x, val, out, order, _binary_fn):
 
         if ti._array_overlap(x, out):
             if not ti._same_logical_tensors(x, out):
-                out = dpt.empty_like(out)
+                out = dpt_ext.empty_like(out)
 
         if isinstance(val, dpt.usm_ndarray):
             if (
@@ -171,12 +171,12 @@ def _clip_none(x, val, out, order, _binary_fn):
                 and not ti._same_logical_tensors(val, out)
                 and val_dtype == res_dt
             ):
-                out = dpt.empty_like(out)
+                out = dpt_ext.empty_like(out)
 
     if isinstance(val, dpt.usm_ndarray):
         val_ary = val
     else:
-        val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q)
+        val_ary = dpt_ext.asarray(val, dtype=val_dtype, sycl_queue=exec_q)
 
     if order == "A":
         order = (
@@ -197,7 +197,7 @@ def _clip_none(x, val, out, order, _binary_fn):
                     x, val_ary, res_dt, res_shape, res_usm_type, exec_q
                 )
             else:
-                out = dpt.empty(
+                out = dpt_ext.empty(
                     res_shape,
                     dtype=res_dt,
                     usm_type=res_usm_type,
@@ -205,9 +205,9 @@ def _clip_none(x, val, out, order, _binary_fn):
                     order=order,
                 )
         if x_shape != res_shape:
-            x = dpt.broadcast_to(x, res_shape)
+            x = dpt_ext.broadcast_to(x, res_shape)
         if val_ary.shape != res_shape:
-            val_ary = dpt.broadcast_to(val_ary, res_shape)
+            val_ary = dpt_ext.broadcast_to(val_ary, res_shape)
         _manager = SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
         ht_binary_ev, binary_ev = _binary_fn(
@@ -229,7 +229,7 @@ def _clip_none(x, val, out, order, _binary_fn):
         if order == "K":
             buf = _empty_like_orderK(val_ary, res_dt)
         else:
-            buf = dpt.empty_like(val_ary, dtype=res_dt, order=order)
+            buf = dpt_ext.empty_like(val_ary, dtype=res_dt, order=order)
         _manager = SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
         ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -242,7 +242,7 @@ def _clip_none(x, val, out, order, _binary_fn):
                     x, buf, res_dt, res_shape, res_usm_type, exec_q
                 )
             else:
-                out = dpt.empty(
+                out = dpt_ext.empty(
                     res_shape,
                     dtype=res_dt,
                     usm_type=res_usm_type,
@@ -251,8 +251,8 @@ def _clip_none(x, val, out, order, _binary_fn):
                 )
 
         if x_shape != res_shape:
-            x = dpt.broadcast_to(x, res_shape)
-        buf = dpt.broadcast_to(buf, res_shape)
+            x = dpt_ext.broadcast_to(x, res_shape)
+        buf = dpt_ext.broadcast_to(buf, res_shape)
         ht_binary_ev, binary_ev = _binary_fn(
             src1=x,
             src2=buf,
@@ -353,14 +353,14 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
 
             if ti._array_overlap(x, out):
                 if not ti._same_logical_tensors(x, out):
-                    out = dpt.empty_like(out)
+                    out = dpt_ext.empty_like(out)
                 else:
                     return out
         else:
             if order == "K":
                 out = _empty_like_orderK(x, x.dtype)
             else:
-                out = dpt.empty_like(x, order=order)
+                out = dpt_ext.empty_like(x, order=order)
 
         _manager = SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
@@ -519,7 +519,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
 
             if ti._array_overlap(x, out):
                 if not ti._same_logical_tensors(x, out):
-                    out = dpt.empty_like(out)
+                    out = dpt_ext.empty_like(out)
 
             if isinstance(min, dpt.usm_ndarray):
                 if (
@@ -527,7 +527,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                     and not ti._same_logical_tensors(min, out)
                     and buf1_dt is None
                 ):
-                    out = dpt.empty_like(out)
+                    out = dpt_ext.empty_like(out)
 
             if isinstance(max, dpt.usm_ndarray):
                 if (
@@ -535,16 +535,16 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                     and not ti._same_logical_tensors(max, out)
                     and buf2_dt is None
                 ):
-                    out = dpt.empty_like(out)
+                    out = dpt_ext.empty_like(out)
 
         if isinstance(min, dpt.usm_ndarray):
             a_min = min
         else:
-            a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q)
+            a_min = dpt_ext.asarray(min, dtype=min_dtype, sycl_queue=exec_q)
         if isinstance(max, dpt.usm_ndarray):
             a_max = max
         else:
-            a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q)
+            a_max = dpt_ext.asarray(max, dtype=max_dtype, sycl_queue=exec_q)
 
         if order == "A":
             order = (
@@ -572,7 +572,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                         exec_q,
                     )
                 else:
-                    out = dpt.empty(
+                    out = dpt_ext.empty(
                         res_shape,
                         dtype=res_dt,
                         usm_type=res_usm_type,
@@ -580,11 +580,11 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                         order=order,
                     )
             if x_shape != res_shape:
-                x = dpt.broadcast_to(x, res_shape)
+                x = dpt_ext.broadcast_to(x, res_shape)
             if a_min.shape != res_shape:
-                a_min = dpt.broadcast_to(a_min, res_shape)
+                a_min = dpt_ext.broadcast_to(a_min, res_shape)
             if a_max.shape != res_shape:
-                a_max = dpt.broadcast_to(a_max, res_shape)
+                a_max = dpt_ext.broadcast_to(a_max, res_shape)
             _manager = SequentialOrderManager[exec_q]
             dep_ev = _manager.submitted_events
             ht_binary_ev, binary_ev = ti._clip(
@@ -612,7 +612,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
             if order == "K":
                 buf2 = _empty_like_orderK(a_max, buf2_dt)
             else:
-                buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order)
+                buf2 = dpt_ext.empty_like(a_max, dtype=buf2_dt, order=order)
             _manager = SequentialOrderManager[exec_q]
             dep_ev = _manager.submitted_events
             ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -631,7 +631,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                         exec_q,
                     )
                 else:
-                    out = dpt.empty(
+                    out = dpt_ext.empty(
                         res_shape,
                         dtype=res_dt,
                         usm_type=res_usm_type,
@@ -639,10 +639,10 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                         order=order,
                     )
 
-            x = dpt.broadcast_to(x, res_shape)
+            x = dpt_ext.broadcast_to(x, res_shape)
             if a_min.shape != res_shape:
-                a_min = dpt.broadcast_to(a_min, res_shape)
-            buf2 = dpt.broadcast_to(buf2, res_shape)
+                a_min = dpt_ext.broadcast_to(a_min, res_shape)
+            buf2 = dpt_ext.broadcast_to(buf2, res_shape)
             ht_binary_ev, binary_ev = ti._clip(
                 src=x,
                 min=a_min,
@@ -668,7 +668,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
             if order == "K":
                 buf1 = _empty_like_orderK(a_min, buf1_dt)
             else:
-                buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order)
+                buf1 = dpt_ext.empty_like(a_min, dtype=buf1_dt, order=order)
             _manager = SequentialOrderManager[exec_q]
             dep_ev = _manager.submitted_events
             ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -687,7 +687,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                         exec_q,
                     )
                 else:
-                    out = dpt.empty(
+                    out = dpt_ext.empty(
                         res_shape,
                         dtype=res_dt,
                         usm_type=res_usm_type,
@@ -695,10 +695,10 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                         order=order,
                     )
 
-            x = dpt.broadcast_to(x, res_shape)
-            buf1 = dpt.broadcast_to(buf1, res_shape)
+            x = dpt_ext.broadcast_to(x, res_shape)
+            buf1 = dpt_ext.broadcast_to(buf1, res_shape)
             if a_max.shape != res_shape:
-                a_max = dpt.broadcast_to(a_max, res_shape)
+                a_max = dpt_ext.broadcast_to(a_max, res_shape)
             ht_binary_ev, binary_ev = ti._clip(
                 src=x,
                 min=buf1,
@@ -736,7 +736,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
         if order == "K":
             buf1 = _empty_like_orderK(a_min, buf1_dt)
         else:
-            buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order)
+            buf1 = dpt_ext.empty_like(a_min, dtype=buf1_dt, order=order)
 
         _manager = SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
@@ -747,7 +747,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
         if order == "K":
             buf2 = _empty_like_orderK(a_max, buf2_dt)
         else:
-            buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order)
+            buf2 = dpt_ext.empty_like(a_max, dtype=buf2_dt, order=order)
         ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs
         )
@@ -758,7 +758,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                     x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
                 )
             else:
-                out = dpt.empty(
+                out = dpt_ext.empty(
                     res_shape,
                     dtype=res_dt,
                     usm_type=res_usm_type,
@@ -766,9 +766,9 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                     order=order,
                 )
 
-        x = dpt.broadcast_to(x, res_shape)
-        buf1 = dpt.broadcast_to(buf1, res_shape)
-        buf2 = dpt.broadcast_to(buf2, res_shape)
+        x = dpt_ext.broadcast_to(x, res_shape)
+        buf1 = dpt_ext.broadcast_to(buf1, res_shape)
+        buf2 = dpt_ext.broadcast_to(buf2, res_shape)
         ht_, clip_ev = ti._clip(
             src=x,
             min=buf1,
diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py
index 64689057eb84..37879997b788 100644
--- a/dpctl_ext/tensor/_copy_utils.py
+++ b/dpctl_ext/tensor/_copy_utils.py
@@ -91,7 +91,7 @@ def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None):
         )
     else:
         Xusm_dtype = dt
-    Xusm = dpt.empty(
+    Xusm = dpt_ext.empty(
         Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue
     )
     _copy_from_numpy_into(Xusm, Xnp)
@@ -159,7 +159,7 @@ def _extract_impl(ary, ary_mask, axis=0):
     elif isinstance(ary_mask, np.ndarray):
         dst_usm_type = ary.usm_type
         exec_q = ary.sycl_queue
-        ary_mask = dpt.asarray(
+        ary_mask = dpt_ext.asarray(
             ary_mask, usm_type=dst_usm_type, sycl_queue=exec_q
         )
     else:
@@ -176,7 +176,7 @@ def _extract_impl(ary, ary_mask, axis=0):
         )
     mask_nelems = ary_mask.size
     cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
-    cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device)
+    cumsum = dpt_ext.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device)
     exec_q = cumsum.sycl_queue
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
@@ -184,7 +184,7 @@ def _extract_impl(ary, ary_mask, axis=0):
         ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs
     )
     dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
-    dst = dpt.empty(
+    dst = dpt_ext.empty(
         dst_shape, dtype=ary.dtype, usm_type=dst_usm_type, device=ary.device
     )
     if dst.size == 0:
@@ -247,7 +247,7 @@ def _nonzero_impl(ary):
     usm_type = ary.usm_type
     mask_nelems = ary.size
     cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
-    cumsum = dpt.empty(
+    cumsum = dpt_ext.empty(
         mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C"
     )
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
@@ -256,7 +256,7 @@ def _nonzero_impl(ary):
         ary, cumsum, sycl_queue=exec_q, depends=dep_evs
     )
     indexes_dt = ti.default_device_index_type(exec_q.sycl_device)
-    indexes = dpt.empty(
+    indexes = dpt_ext.empty(
         (ary.ndim, mask_count),
         dtype=indexes_dt,
         usm_type=usm_type,
@@ -284,7 +284,7 @@ def _prepare_indices_arrays(inds, q, usm_type):
             lambda ind: (
                 ind
                 if isinstance(ind, dpt.usm_ndarray)
-                else dpt.asarray(ind, usm_type=usm_type, sycl_queue=q)
+                else dpt_ext.asarray(ind, usm_type=usm_type, sycl_queue=q)
             ),
             inds,
         )
@@ -299,14 +299,14 @@ def _prepare_indices_arrays(inds, q, usm_type):
     inds = tuple(
         map(
             lambda ind: (
-                ind if ind.dtype == ind_dt else dpt.astype(ind, ind_dt)
+                ind if ind.dtype == ind_dt else dpt_ext.astype(ind, ind_dt)
             ),
             inds,
         )
     )
 
     # broadcast
-    inds = dpt.broadcast_arrays(*inds)
+    inds = dpt_ext.broadcast_arrays(*inds)
 
     return inds
 
@@ -332,7 +332,7 @@ def _put_multi_index(ary, inds, p, vals, mode=0):
 
     if exec_q is not None:
         if not isinstance(vals, dpt.usm_ndarray):
-            vals = dpt.asarray(
+            vals = dpt_ext.asarray(
                 vals,
                 dtype=ary.dtype,
                 usm_type=coerced_usm_type,
@@ -368,7 +368,7 @@ def _put_multi_index(ary, inds, p, vals, mode=0):
         rhs = vals
     else:
         rhs = dpt_ext.astype(vals, ary.dtype)
-    rhs = dpt.broadcast_to(rhs, expected_vals_shape)
+    rhs = dpt_ext.broadcast_to(rhs, expected_vals_shape)
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
     dep_ev = _manager.submitted_events
     hev, put_ev = ti._put(
@@ -418,7 +418,7 @@ def _take_multi_index(ary, inds, p, mode=0):
     if 0 in ary_sh[p:p_end] and ind0.size != 0:
         raise IndexError("cannot take non-empty indices from an empty axis")
     res_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:]
-    res = dpt.empty(
+    res = dpt_ext.empty(
         res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q
     )
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
@@ -681,7 +681,9 @@ def _make_empty_like_orderK(x, dt, usm_type, dev):
     inv_perm = sorted(range(x.ndim), key=lambda i: perm[i])
     sh = x.shape
     sh_sorted = tuple(sh[i] for i in perm)
-    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
+    R = dpt_ext.empty(
+        sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C"
+    )
     if min(st) < 0:
         st_sorted = [st[i] for i in perm]
         sl = tuple(
@@ -693,7 +695,7 @@ def _make_empty_like_orderK(x, dt, usm_type, dev):
             for i in range(x.ndim)
         )
         R = R[sl]
-    return dpt.permute_dims(R, inv_perm)
+    return dpt_ext.permute_dims(R, inv_perm)
 
 
 def _empty_like_orderK(x, dt, usm_type=None, dev=None):
@@ -712,11 +714,11 @@ def _empty_like_orderK(x, dt, usm_type=None, dev=None):
         dev = x.device
     fl = x.flags
     if fl["C"] or x.size <= 1:
-        return dpt.empty_like(
+        return dpt_ext.empty_like(
             x, dtype=dt, usm_type=usm_type, device=dev, order="C"
         )
     elif fl["F"]:
-        return dpt.empty_like(
+        return dpt_ext.empty_like(
             x, dtype=dt, usm_type=usm_type, device=dev, order="F"
         )
     return _make_empty_like_orderK(x, dt, usm_type, dev)
@@ -734,11 +736,11 @@ def _from_numpy_empty_like_orderK(x, dt, usm_type, dev):
         raise TypeError(f"Expected numpy.ndarray, got {type(x)}")
     fl = x.flags
     if fl["C"] or x.size <= 1:
-        return dpt.empty(
+        return dpt_ext.empty(
             x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
         )
     elif fl["F"]:
-        return dpt.empty(
+        return dpt_ext.empty(
             x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
         )
     return _make_empty_like_orderK(x, dt, usm_type, dev)
@@ -758,11 +760,11 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
     fl1 = X1.flags
     fl2 = X2.flags
     if fl1["C"] or fl2["C"]:
-        return dpt.empty(
+        return dpt_ext.empty(
             res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
         )
     if fl1["F"] and fl2["F"]:
-        return dpt.empty(
+        return dpt_ext.empty(
             res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
         )
     st1 = list(X1.strides)
@@ -785,7 +787,9 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
     st2_sorted = [st2[i] for i in perm]
     sh = res_shape
     sh_sorted = tuple(sh[i] for i in perm)
-    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
+    R = dpt_ext.empty(
+        sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C"
+    )
     if max(min(st1_sorted), min(st2_sorted)) < 0:
         sl = tuple(
             (
@@ -796,7 +800,7 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
             for i in range(nd1)
         )
         R = R[sl]
-    return dpt.permute_dims(R, inv_perm)
+    return dpt_ext.permute_dims(R, inv_perm)
 
 
 def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
@@ -823,11 +827,11 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
     fl2 = X2.flags
     fl3 = X3.flags
     if fl1["C"] or fl2["C"] or fl3["C"]:
-        return dpt.empty(
+        return dpt_ext.empty(
             res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
         )
     if fl1["F"] and fl2["F"] and fl3["F"]:
-        return dpt.empty(
+        return dpt_ext.empty(
             res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
         )
     st1 = list(X1.strides)
@@ -855,7 +859,9 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
     st3_sorted = [st3[i] for i in perm]
     sh = res_shape
     sh_sorted = tuple(sh[i] for i in perm)
-    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
+    R = dpt_ext.empty(
+        sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C"
+    )
     if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0:
         sl = tuple(
             (
@@ -870,7 +876,7 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
             for i in range(nd1)
         )
         R = R[sl]
-    return dpt.permute_dims(R, inv_perm)
+    return dpt_ext.permute_dims(R, inv_perm)
 
 
 def copy(usm_ary, /, *, order="K"):
diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py
index 5a9e07c73346..0b7650873fe3 100644
--- a/dpctl_ext/tensor/_ctors.py
+++ b/dpctl_ext/tensor/_ctors.py
@@ -30,17 +30,292 @@
 from numbers import Number
 
 import dpctl
+import dpctl.memory as dpm
 import dpctl.tensor as dpt
 import dpctl.utils
 import numpy as np
 from dpctl.tensor._data_types import _get_dtype
 from dpctl.tensor._device import normalize_queue_device
+from dpctl.tensor._usmarray import _is_object_with_buffer_protocol
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._tensor_impl as ti
 
+from ._copy_utils import (
+    _empty_like_orderK,
+    _from_numpy_empty_like_orderK,
+)
+
+__doc__ = "Implementation of creation functions in :module:`dpctl.tensor`"
+
+_empty_tuple = ()
+_host_set = frozenset([None])
+
+
+def _array_info_dispatch(obj):
+    if isinstance(obj, dpt.usm_ndarray):
+        return obj.shape, obj.dtype, frozenset([obj.sycl_queue])
+    if isinstance(obj, np.ndarray):
+        return obj.shape, obj.dtype, _host_set
+    if isinstance(obj, range):
+        return (len(obj),), int, _host_set
+    if isinstance(obj, bool):
+        return _empty_tuple, bool, _host_set
+    if isinstance(obj, float):
+        return _empty_tuple, float, _host_set
+    if isinstance(obj, int):
+        return _empty_tuple, int, _host_set
+    if isinstance(obj, complex):
+        return _empty_tuple, complex, _host_set
+    if isinstance(
+        obj,
+        (
+            list,
+            tuple,
+        ),
+    ):
+        return _array_info_sequence(obj)
+    if _is_object_with_buffer_protocol(obj):
+        np_obj = np.array(obj)
+        return np_obj.shape, np_obj.dtype, _host_set
+    if hasattr(obj, "__usm_ndarray__"):
+        usm_ar = obj.__usm_ndarray__
+        if isinstance(usm_ar, dpt.usm_ndarray):
+            return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue])
+    if hasattr(obj, "__sycl_usm_array_interface__"):
+        usm_ar = _usm_ndarray_from_suai(obj)
+        return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue])
+
+
+def _array_info_sequence(li):
+    if not isinstance(li, (list, tuple, range)):
+        raise TypeError(f"Expected list, tuple, or range, got {type(li)}")
+    n = len(li)
+    dim = None
+    dt = None
+    device = frozenset()
+    for el in li:
+        el_dim, el_dt, el_dev = _array_info_dispatch(el)
+        if dim is None:
+            dim = el_dim
+            dt = np.promote_types(el_dt, el_dt)
+            device = device.union(el_dev)
+        elif el_dim == dim:
+            dt = np.promote_types(dt, el_dt)
+            device = device.union(el_dev)
+        else:
+            raise ValueError(f"Inconsistent dimensions, {dim} and {el_dim}")
+    if dim is None:
+        dim = ()
+        dt = float
+        device = _host_set
+    return (n,) + dim, dt, device
+
+
+def _asarray_from_numpy_ndarray(
+    ary, dtype=None, usm_type=None, sycl_queue=None, order="K"
+):
+    if not isinstance(ary, np.ndarray):
+        raise TypeError(f"Expected numpy.ndarray, got {type(ary)}")
+    if usm_type is None:
+        usm_type = "device"
+    copy_q = normalize_queue_device(sycl_queue=None, device=sycl_queue)
+    if ary.dtype.char not in "?bBhHiIlLqQefdFD":
+        raise TypeError(
+            f"Numpy array of data type {ary.dtype} is not supported. "
+            "Please convert the input to an array with numeric data type."
+        )
+    if dtype is None:
+        # deduce device-representable output data type
+        dtype = _map_to_device_dtype(ary.dtype, copy_q)
+    _ensure_native_dtype_device_support(dtype, copy_q.sycl_device)
+    f_contig = ary.flags["F"]
+    c_contig = ary.flags["C"]
+    fc_contig = f_contig or c_contig
+    if order == "A":
+        order = "F" if f_contig and not c_contig else "C"
+    if order == "K" and fc_contig:
+        order = "C" if c_contig else "F"
+    if order == "K":
+        # new USM allocation
+        res = _from_numpy_empty_like_orderK(ary, dtype, usm_type, copy_q)
+    else:
+        res = dpt.usm_ndarray(
+            ary.shape,
+            dtype=dtype,
+            buffer=usm_type,
+            order=order,
+            buffer_ctor_kwargs={"queue": copy_q},
+        )
+    res[...] = ary
+    return res
+
+
+def _asarray_from_seq(
+    seq_obj,
+    seq_shape,
+    seq_dt,
+    alloc_q,
+    exec_q,
+    dtype=None,
+    usm_type=None,
+    order="C",
+):
+    """`seq_obj` is a sequence"""
+    if usm_type is None:
+        usm_types_in_seq = []
+        _usm_types_walker(seq_obj, usm_types_in_seq)
+        usm_type = dpctl.utils.get_coerced_usm_type(usm_types_in_seq)
+    dpctl.utils.validate_usm_type(usm_type)
+    if dtype is None:
+        dtype = _map_to_device_dtype(seq_dt, alloc_q)
+    else:
+        _mapped_dt = _map_to_device_dtype(dtype, alloc_q)
+        if _mapped_dt != dtype:
+            raise ValueError(
+                f"Device {alloc_q.sycl_device} "
+                f"does not support {dtype} natively."
+            )
+        dtype = _mapped_dt
+    if order in "KA":
+        order = "C"
+    if isinstance(exec_q, dpctl.SyclQueue):
+        res = dpt_ext.empty(
+            seq_shape,
+            dtype=dtype,
+            usm_type=usm_type,
+            sycl_queue=alloc_q,
+            order=order,
+        )
+        _manager = dpctl.utils.SequentialOrderManager[exec_q]
+        _device_copy_walker(seq_obj, res, _manager)
+        return res
+    else:
+        res = dpt_ext.empty(
+            seq_shape,
+            dtype=dtype,
+            usm_type=usm_type,
+            sycl_queue=alloc_q,
+            order=order,
+        )
+        _copy_through_host_walker(seq_obj, res)
+        return res
+
+
+def _asarray_from_seq_single_device(
+    obj,
+    seq_shape,
+    seq_dt,
+    seq_dev,
+    dtype=None,
+    usm_type=None,
+    sycl_queue=None,
+    order="C",
+):
+    if sycl_queue is None:
+        exec_q = seq_dev
+        alloc_q = seq_dev
+    else:
+        exec_q = dpctl.utils.get_execution_queue(
+            (
+                sycl_queue,
+                seq_dev,
+            )
+        )
+        alloc_q = sycl_queue
+    return _asarray_from_seq(
+        obj,
+        seq_shape,
+        seq_dt,
+        alloc_q,
+        exec_q,
+        dtype=dtype,
+        usm_type=usm_type,
+        order=order,
+    )
+
+
+def _asarray_from_usm_ndarray(
+    usm_ndary,
+    dtype=None,
+    copy=None,
+    usm_type=None,
+    sycl_queue=None,
+    order="K",
+):
+    if not isinstance(usm_ndary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpctl.tensor.usm_ndarray, got {type(usm_ndary)}"
+        )
+    if usm_type is None:
+        usm_type = usm_ndary.usm_type
+    if sycl_queue is not None:
+        exec_q = dpctl.utils.get_execution_queue(
+            [usm_ndary.sycl_queue, sycl_queue]
+        )
+        copy_q = normalize_queue_device(sycl_queue=sycl_queue, device=exec_q)
+    else:
+        copy_q = usm_ndary.sycl_queue
+    if dtype is None:
+        dtype = _map_to_device_dtype(usm_ndary.dtype, copy_q)
+    # Conditions for zero copy:
+    can_zero_copy = copy is not True
+    #    dtype is unchanged
+    can_zero_copy = can_zero_copy and dtype == usm_ndary.dtype
+    #    USM allocation type is unchanged
+    can_zero_copy = can_zero_copy and usm_type == usm_ndary.usm_type
+    #    sycl_queue is unchanged
+    can_zero_copy = can_zero_copy and copy_q is usm_ndary.sycl_queue
+    #    order is unchanged
+    c_contig = usm_ndary.flags.c_contiguous
+    f_contig = usm_ndary.flags.f_contiguous
+    fc_contig = usm_ndary.flags.forc
+    if can_zero_copy:
+        if order == "C" and c_contig:
+            pass
+        elif order == "F" and f_contig:
+            pass
+        elif order == "A" and fc_contig:
+            pass
+        elif order == "K":
+            pass
+        else:
+            can_zero_copy = False
+    if copy is False and can_zero_copy is False:
+        raise ValueError("asarray(..., copy=False) is not possible")
+    if can_zero_copy:
+        return usm_ndary
+    if order == "A":
+        order = "F" if f_contig and not c_contig else "C"
+    if order == "K" and fc_contig:
+        order = "C" if c_contig else "F"
+    if order == "K":
+        _ensure_native_dtype_device_support(dtype, copy_q.sycl_device)
+        res = _empty_like_orderK(usm_ndary, dtype, usm_type, copy_q)
+    else:
+        _ensure_native_dtype_device_support(dtype, copy_q.sycl_device)
+        res = dpt.usm_ndarray(
+            usm_ndary.shape,
+            dtype=dtype,
+            buffer=usm_type,
+            order=order,
+            buffer_ctor_kwargs={"queue": copy_q},
+        )
+    eq = dpctl.utils.get_execution_queue([usm_ndary.sycl_queue, copy_q])
+    if eq is not None:
+        _manager = dpctl.utils.SequentialOrderManager[eq]
+        dep_evs = _manager.submitted_events
+        hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=usm_ndary, dst=res, sycl_queue=eq, depends=dep_evs
+        )
+        _manager.add_event_pair(hev, cpy_ev)
+    else:
+        tmp = dpt_ext.asnumpy(usm_ndary)
+        res[...] = tmp
+    return res
+
 
 def _cast_fill_val(fill_val, dt):
     """
@@ -58,6 +333,99 @@ def _cast_fill_val(fill_val, dt):
         return fill_val
 
 
+def _coerce_and_infer_dt(*args, dt, sycl_queue, err_msg, allow_bool=False):
+    """Deduce arange type from sequence spec"""
+    nd, seq_dt, d = _array_info_sequence(args)
+    if d != _host_set or nd != (len(args),):
+        raise ValueError(err_msg)
+    dt = _get_dtype(dt, sycl_queue, ref_type=seq_dt)
+    if np.issubdtype(dt, np.integer):
+        return tuple(int(v) for v in args), dt
+    if np.issubdtype(dt, np.floating):
+        return tuple(float(v) for v in args), dt
+    if np.issubdtype(dt, np.complexfloating):
+        return tuple(complex(v) for v in args), dt
+    if allow_bool and dt.char == "?":
+        return tuple(bool(v) for v in args), dt
+    raise ValueError(f"Data type {dt} is not supported")
+
+
+def _copy_through_host_walker(seq_o, usm_res):
+    if isinstance(seq_o, dpt.usm_ndarray):
+        if (
+            dpctl.utils.get_execution_queue(
+                (
+                    usm_res.sycl_queue,
+                    seq_o.sycl_queue,
+                )
+            )
+            is None
+        ):
+            usm_res[...] = dpt.asnumpy(seq_o).copy()
+            return
+        else:
+            usm_res[...] = seq_o
+    if hasattr(seq_o, "__usm_ndarray__"):
+        usm_arr = seq_o.__usm_ndarray__
+        if isinstance(usm_arr, dpt.usm_ndarray):
+            _copy_through_host_walker(usm_arr, usm_res)
+            return
+    if hasattr(seq_o, "__sycl_usm_array_interface__"):
+        usm_ar = _usm_ndarray_from_suai(seq_o)
+        if (
+            dpctl.utils.get_execution_queue(
+                (
+                    usm_res.sycl_queue,
+                    usm_ar.sycl_queue,
+                )
+            )
+            is None
+        ):
+            usm_res[...] = dpt_ext.asnumpy(usm_ar).copy()
+        else:
+            usm_res[...] = usm_ar
+        return
+    if _is_object_with_buffer_protocol(seq_o):
+        np_ar = np.asarray(seq_o)
+        usm_res[...] = np_ar
+        return
+    if isinstance(seq_o, (list, tuple)):
+        for i, el in enumerate(seq_o):
+            _copy_through_host_walker(el, usm_res[i])
+        return
+    usm_res[...] = np.asarray(seq_o)
+
+
+def _device_copy_walker(seq_o, res, _manager):
+    if isinstance(seq_o, dpt.usm_ndarray):
+        exec_q = res.sycl_queue
+        deps = _manager.submitted_events
+        ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=seq_o, dst=res, sycl_queue=exec_q, depends=deps
+        )
+        _manager.add_event_pair(ht_ev, cpy_ev)
+        return
+    if hasattr(seq_o, "__usm_ndarray__"):
+        usm_arr = seq_o.__usm_ndarray__
+        if isinstance(usm_arr, dpt.usm_ndarray):
+            _device_copy_walker(usm_arr, res, _manager)
+            return
+    if hasattr(seq_o, "__sycl_usm_array_interface__"):
+        usm_ar = _usm_ndarray_from_suai(seq_o)
+        exec_q = res.sycl_queue
+        deps = _manager.submitted_events
+        ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=usm_ar, dst=res, sycl_queue=exec_q, depends=deps
+        )
+        _manager.add_event_pair(ht_ev, cpy_ev)
+        return
+    if isinstance(seq_o, (list, tuple)):
+        for i, el in enumerate(seq_o):
+            _device_copy_walker(el, res[i], _manager)
+        return
+    raise TypeError
+
+
 def _ensure_native_dtype_device_support(dtype, dev) -> None:
     """Check that dtype is natively supported by device.
 
@@ -90,54 +458,1091 @@ def _ensure_native_dtype_device_support(dtype, dev) -> None:
         )
 
 
-def _to_scalar(obj, sc_ty):
-    """A way to convert object to NumPy scalar type.
-    Raises OverflowError if obj can not be represented
-    using the requested scalar type.
-    """
-    zd_arr = np.asarray(obj, dtype=sc_ty)
-    return zd_arr[()]
+def _get_arange_length(start, stop, step):
+    """Compute length of arange sequence"""
+    span = stop - start
+    if hasattr(step, "__float__") and hasattr(span, "__float__"):
+        return _round_for_arange(span / step)
+    tmp = span / step
+    if hasattr(tmp, "__complex__"):
+        tmp = complex(tmp)
+        tmp = tmp.real
+    else:
+        tmp = float(tmp)
+    return _round_for_arange(tmp)
+
+
+def _map_to_device_dtype(dt, q):
+    dtc = dt.char
+    if dtc == "?" or np.issubdtype(dt, np.integer):
+        return dt
+    d = q.sycl_device
+    if np.issubdtype(dt, np.floating):
+        if dtc == "f":
+            return dt
+        if dtc == "d" and d.has_aspect_fp64:
+            return dt
+        if dtc == "e" and d.has_aspect_fp16:
+            return dt
+        return dpt.dtype("f4")
+    if np.issubdtype(dt, np.complexfloating):
+        if dtc == "F":
+            return dt
+        if dtc == "D" and d.has_aspect_fp64:
+            return dt
+        return dpt.dtype("c8")
+    raise RuntimeError(f"Unrecognized data type '{dt}' encountered.")
+
+
+def _normalize_order(order, arr):
+    """
+    Utility function for processing the `order` keyword of array-like
+    constructors, which support `"K"` and `"A"` orders.
+    """
+    arr_flags = arr.flags
+    f_contig = arr_flags["F"]
+    c_contig = arr_flags["C"]
+    if order == "A":
+        order = "F" if f_contig and not c_contig else "C"
+    if order == "K" and (f_contig or c_contig):
+        order = "C" if c_contig else "F"
+    return order
+
+
+def _round_for_arange(tmp):
+    k = int(tmp)
+    if k >= 0 and float(k) < tmp:
+        tmp = tmp + 1
+    return tmp
+
+
+def _to_scalar(obj, sc_ty):
+    """A way to convert object to NumPy scalar type.
+    Raises OverflowError if obj can not be represented
+    using the requested scalar type.
+    """
+    zd_arr = np.asarray(obj, dtype=sc_ty)
+    return zd_arr[()]
+
+
+def _usm_ndarray_from_suai(obj):
+    sua_iface = obj.__sycl_usm_array_interface__
+    membuf = dpm.as_usm_memory(obj)
+    ary = dpt.usm_ndarray(
+        sua_iface["shape"],
+        dtype=sua_iface["typestr"],
+        buffer=membuf,
+        strides=sua_iface.get("strides", None),
+    )
+    _data_field = sua_iface["data"]
+    if isinstance(_data_field, tuple) and len(_data_field) > 1:
+        ro_field = _data_field[1]
+    else:
+        ro_field = False
+    if ro_field:
+        ary.flags["W"] = False
+    return ary
+
+
+def _usm_types_walker(o, usm_types_list):
+    if isinstance(o, dpt.usm_ndarray):
+        usm_types_list.append(o.usm_type)
+        return
+    if hasattr(o, "__usm_ndarray__"):
+        usm_arr = o.__usm_ndarray__
+        if isinstance(usm_arr, dpt.usm_ndarray):
+            usm_types_list.append(usm_arr.usm_type)
+            return
+    if hasattr(o, "__sycl_usm_array_interface__"):
+        usm_ar = _usm_ndarray_from_suai(o)
+        usm_types_list.append(usm_ar.usm_type)
+        return
+    if _is_object_with_buffer_protocol(o):
+        return
+    if isinstance(o, (int, bool, float, complex)):
+        return
+    if isinstance(o, (list, tuple, range)):
+        for el in o:
+            _usm_types_walker(el, usm_types_list)
+        return
+    raise TypeError
+
+
+def arange(
+    start,
+    /,
+    stop=None,
+    step=1,
+    *,
+    dtype=None,
+    device=None,
+    usm_type="device",
+    sycl_queue=None,
+):
+    """
+    Returns evenly spaced values within the half-open interval [start, stop)
+    as a one-dimensional array.
+
+    Args:
+        start:
+            Starting point of the interval
+        stop:
+            Ending point of the interval. Default: ``None``
+        step: Increment of the returned sequence. Default: ``1``
+        dtype: Output array data type. Default: ``None``
+        device (optional): array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            Array populated with evenly spaced values.
+    """
+    if stop is None:
+        stop = start
+        start = 0
+    if step is None:
+        step = 1
+    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    is_bool = False
+    if dtype:
+        is_bool = (dtype is bool) or (dpt.dtype(dtype) == dpt.bool)
+    _, dt = _coerce_and_infer_dt(
+        start,
+        stop,
+        step,
+        dt=dpt.int8 if is_bool else dtype,
+        sycl_queue=sycl_queue,
+        err_msg="start, stop, and step must be Python scalars",
+        allow_bool=False,
+    )
+    try:
+        tmp = _get_arange_length(start, stop, step)
+        sh = max(int(tmp), 0)
+    except TypeError:
+        sh = 0
+    if is_bool and sh > 2:
+        raise ValueError("no fill-function for boolean data type")
+    res = dpt.usm_ndarray(
+        (sh,),
+        dtype=dt,
+        buffer=usm_type,
+        order="C",
+        buffer_ctor_kwargs={"queue": sycl_queue},
+    )
+    sc_ty = dt.type
+    _first = _to_scalar(start, sc_ty)
+    if sh > 1:
+        _second = _to_scalar(start + step, sc_ty)
+        if dt in [dpt.uint8, dpt.uint16, dpt.uint32, dpt.uint64]:
+            int64_ty = dpt.int64.type
+            _step = int64_ty(_second) - int64_ty(_first)
+        else:
+            _step = _second - _first
+        _step = sc_ty(_step)
+    else:
+        _step = sc_ty(1)
+    _start = _first
+    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    # populating newly allocated array, no task dependencies
+    hev, lin_ev = ti._linspace_step(_start, _step, res, sycl_queue)
+    _manager.add_event_pair(hev, lin_ev)
+    if is_bool:
+        res_out = dpt.usm_ndarray(
+            (sh,),
+            dtype=dpt.bool,
+            buffer=usm_type,
+            order="C",
+            buffer_ctor_kwargs={"queue": sycl_queue},
+        )
+        hev_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=res, dst=res_out, sycl_queue=sycl_queue, depends=[lin_ev]
+        )
+        _manager.add_event_pair(hev_cpy, cpy_ev)
+        return res_out
+    return res
+
+
+def asarray(
+    obj,
+    /,
+    *,
+    dtype=None,
+    device=None,
+    copy=None,
+    usm_type=None,
+    sycl_queue=None,
+    order="K",
+):
+    """
+    Converts input object to :class:`dpctl.tensor.usm_ndarray`.
+
+    Args:
+        obj: Python object to convert. Can be an instance of
+            :class:`dpctl.tensor.usm_ndarray`,
+            an object representing SYCL USM allocation and implementing
+            ``__sycl_usm_array_interface__`` protocol, an instance
+            of :class:`numpy.ndarray`, an object supporting Python buffer
+            protocol, a Python scalar, or a (possibly nested) sequence of
+            Python scalars.
+        dtype (data type, optional):
+            output array data type. If ``dtype`` is
+            ``None``, the output array data type is inferred from data types in
+            ``obj``. Default: ``None``
+        copy (`bool`, optional):
+            boolean indicating whether or not to copy the
+            input. If ``True``, always creates a copy. If ``False``, the
+            need to copy raises :exc:`ValueError`. If ``None``, tries to reuse
+            existing memory allocations if possible, but allows to perform
+            a copy otherwise. Default: ``None``
+        order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional):
+            memory layout of the output array. Default: ``"K"``
+        device (optional): array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            Array created from input object.
+    """
+    # 1. Check that copy is a valid keyword
+    if copy not in [None, True, False]:
+        raise TypeError(
+            "Recognized copy keyword values should be True, False, or None"
+        )
+    # 2. Check that dtype is None, or a valid dtype
+    if dtype is not None:
+        dtype = dpt.dtype(dtype)
+    # 3. Validate order
+    if not isinstance(order, str):
+        raise TypeError(
+            f"Expected order keyword to be of type str, got {type(order)}"
+        )
+    if len(order) == 0 or order[0] not in "KkAaCcFf":
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'."
+        )
+    order = order[0].upper()
+    # 4. Check that usm_type is None, or a valid value
+    dpctl.utils.validate_usm_type(usm_type, allow_none=True)
+    # 5. Normalize device/sycl_queue [keep it None if was None]
+    if device is not None or sycl_queue is not None:
+        sycl_queue = normalize_queue_device(
+            sycl_queue=sycl_queue, device=device
+        )
+
+    # handle instance(obj, usm_ndarray)
+    if isinstance(obj, dpt.usm_ndarray):
+        return _asarray_from_usm_ndarray(
+            obj,
+            dtype=dtype,
+            copy=copy,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+            order=order,
+        )
+    if hasattr(obj, "__usm_ndarray__"):
+        usm_arr = obj.__usm_ndarray__
+        if isinstance(usm_arr, dpt.usm_ndarray):
+            return _asarray_from_usm_ndarray(
+                usm_arr,
+                dtype=dtype,
+                copy=copy,
+                usm_type=usm_type,
+                sycl_queue=sycl_queue,
+                order=order,
+            )
+    if hasattr(obj, "__sycl_usm_array_interface__"):
+        ary = _usm_ndarray_from_suai(obj)
+        return _asarray_from_usm_ndarray(
+            ary,
+            dtype=dtype,
+            copy=copy,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+            order=order,
+        )
+    if isinstance(obj, np.ndarray):
+        if copy is False:
+            raise ValueError(
+                "Converting numpy.ndarray to usm_ndarray requires a copy"
+            )
+        return _asarray_from_numpy_ndarray(
+            obj,
+            dtype=dtype,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+            order=order,
+        )
+    if _is_object_with_buffer_protocol(obj):
+        if copy is False:
+            raise ValueError(
+                f"Converting {type(obj)} to usm_ndarray requires a copy"
+            )
+        return _asarray_from_numpy_ndarray(
+            np.array(obj),
+            dtype=dtype,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+            order=order,
+        )
+    if isinstance(obj, (list, tuple, range)):
+        if copy is False:
+            raise ValueError(
+                "Converting Python sequence to usm_ndarray requires a copy"
+            )
+        seq_shape, seq_dt, devs = _array_info_sequence(obj)
+        if devs == _host_set:
+            return _asarray_from_numpy_ndarray(
+                np.asarray(obj, dtype=dtype, order=order),
+                dtype=dtype,
+                usm_type=usm_type,
+                sycl_queue=sycl_queue,
+                order=order,
+            )
+        elif len(devs) == 1:
+            seq_dev = list(devs)[0]
+            return _asarray_from_seq_single_device(
+                obj,
+                seq_shape,
+                seq_dt,
+                seq_dev,
+                dtype=dtype,
+                usm_type=usm_type,
+                sycl_queue=sycl_queue,
+                order=order,
+            )
+        elif len(devs) > 1:
+            devs = [dev for dev in devs if dev is not None]
+            if sycl_queue is None:
+                if len(devs) == 1:
+                    alloc_q = devs[0]
+                else:
+                    raise dpctl.utils.ExecutionPlacementError(
+                        "Please specify `device` or `sycl_queue` keyword "
+                        "argument to determine where to allocate the "
+                        "resulting array."
+                    )
+            else:
+                alloc_q = sycl_queue
+            return _asarray_from_seq(
+                obj,
+                seq_shape,
+                seq_dt,
+                alloc_q,
+                #  force copying via host
+                None,
+                dtype=dtype,
+                usm_type=usm_type,
+                order=order,
+            )
+    if copy is False:
+        raise ValueError(
+            f"Converting {type(obj)} to usm_ndarray requires a copy"
+        )
+    # obj is a scalar, create 0d array
+    return _asarray_from_numpy_ndarray(
+        np.asarray(obj, dtype=dtype),
+        dtype=dtype,
+        usm_type=usm_type,
+        sycl_queue=sycl_queue,
+        order="C",
+    )
+
+
+def empty(
+    shape,
+    *,
+    dtype=None,
+    order="C",
+    device=None,
+    usm_type="device",
+    sycl_queue=None,
+):
+    """
+    Creates :class:`dpctl.tensor.usm_ndarray` from uninitialized
+    USM allocation.
+
+    Args:
+        shape (Tuple[int], int):
+            Dimensions of the array to be created.
+        dtype (optional):
+            data type of the array. Can be typestring,
+            a :class:`numpy.dtype` object, :mod:`numpy` char string,
+            or a NumPy scalar type. The ``None`` value creates an
+            array of floating point data type. Default: ``None``
+        order (``"C"``, or ``F"``):
+            memory layout for the array. Default: ``"C"``
+        device (optional): array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            Created empty array.
+    """
+    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'F' or 'C'."
+        )
+    order = order[0].upper()
+    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dtype = _get_dtype(dtype, sycl_queue)
+    _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+    res = dpt.usm_ndarray(
+        shape,
+        dtype=dtype,
+        buffer=usm_type,
+        order=order,
+        buffer_ctor_kwargs={"queue": sycl_queue},
+    )
+    return res
+
+
+def empty_like(
+    x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None
+):
+    """
+    Returns an uninitialized :class:`dpctl.tensor.usm_ndarray` with the
+    same `shape` as the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            Input array from which to derive the output array shape.
+        dtype (optional):
+            data type of the array. Can be a typestring,
+            a :class:`numpy.dtype` object, NumPy char string,
+            or a NumPy scalar type. Default: ``None``
+        order ("C", "F", "A", or "K"):
+            memory layout for the array. Default: ``"K"``
+        device (optional): array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            Created empty array with uninitialized memory.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.")
+    if (
+        not isinstance(order, str)
+        or len(order) == 0
+        or order[0] not in "CcFfAaKk"
+    ):
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'."
+        )
+    order = order[0].upper()
+    if dtype is None:
+        dtype = x.dtype
+    if usm_type is None:
+        usm_type = x.usm_type
+    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    if device is None and sycl_queue is None:
+        device = x.device
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dtype = dpt.dtype(dtype)
+    order = _normalize_order(order, x)
+    if order == "K":
+        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+        return _empty_like_orderK(x, dtype, usm_type, sycl_queue)
+    else:
+        shape = x.shape
+        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+        res = dpt.usm_ndarray(
+            shape,
+            dtype=dtype,
+            buffer=usm_type,
+            order=order,
+            buffer_ctor_kwargs={"queue": sycl_queue},
+        )
+        return res
+
+
+def eye(
+    n_rows,
+    n_cols=None,
+    /,
+    *,
+    k=0,
+    dtype=None,
+    order="C",
+    device=None,
+    usm_type="device",
+    sycl_queue=None,
+):
+    """
+    eye(n_rows, n_cols=None, /, *, k=0, dtype=None, \
+        device=None, usm_type="device", sycl_queue=None)
+
+    Creates :class:`dpctl.tensor.usm_ndarray` with ones on the `k`-th
+    diagonal.
+
+    Args:
+        n_rows (int):
+            number of rows in the output array.
+        n_cols (int, optional):
+            number of columns in the output array. If ``None``,
+            ``n_cols = n_rows``. Default: ``None``
+        k (int):
+            index of the diagonal, with ``0`` as the main diagonal.
+            A positive value of ``k`` is a superdiagonal, a negative value
+            is a subdiagonal.
+            Raises :exc:`TypeError` if ``k`` is not an integer.
+            Default: ``0``
+        dtype (optional):
+            data type of the array. Can be typestring,
+            a :class:`numpy.dtype` object, :mod:`numpy` char string, or
+            a NumPy scalar type. Default: ``None``
+        order ("C" or "F"):
+            memory layout for the array. Default: ``"C"``
+        device (optional):
+            array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            A diagonal matrix.
+    """
+    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'F' or 'C'."
+        )
+    order = order[0].upper()
+    n_rows = operator.index(n_rows)
+    n_cols = n_rows if n_cols is None else operator.index(n_cols)
+    k = operator.index(k)
+    if k >= n_cols or -k >= n_rows:
+        return dpt_ext.zeros(
+            (n_rows, n_cols),
+            dtype=dtype,
+            order=order,
+            device=device,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+        )
+    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dtype = _get_dtype(dtype, sycl_queue)
+    _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+    res = dpt.usm_ndarray(
+        (n_rows, n_cols),
+        dtype=dtype,
+        buffer=usm_type,
+        order=order,
+        buffer_ctor_kwargs={"queue": sycl_queue},
+    )
+    if n_rows != 0 and n_cols != 0:
+        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+        hev, eye_ev = ti._eye(k, dst=res, sycl_queue=sycl_queue)
+        _manager.add_event_pair(hev, eye_ev)
+    return res
+
+
+def _validate_fill_value(fill_val):
+    """Validates that `fill_val` is a numeric or boolean scalar."""
+    # TODO: verify if `np.True_` and `np.False_` should be instances of
+    # Number in NumPy, like other NumPy scalars and like Python bools
+    # check for `np.bool_` separately as NumPy<2 has no `np.bool`
+    if not isinstance(fill_val, Number) and not isinstance(fill_val, np.bool_):
+        raise TypeError(
+            f"array cannot be filled with scalar of type {type(fill_val)}"
+        )
+
+
+def full(
+    shape,
+    fill_value,
+    *,
+    dtype=None,
+    order="C",
+    device=None,
+    usm_type=None,
+    sycl_queue=None,
+):
+    """
+    Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified
+    shape and filled with `fill_value`.
+
+    Args:
+        shape (tuple):
+            Dimensions of the array to be created.
+        fill_value (int,float,complex,usm_ndarray):
+            fill value
+        dtype (optional): data type of the array. Can be typestring,
+            a :class:`numpy.dtype` object, :mod:`numpy` char string,
+            or a NumPy scalar type. Default: ``None``
+        order ("C", or "F"):
+            memory layout for the array. Default: ``"C"``
+        device (optional): array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            New array initialized with given value.
+    """
+    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'F' or 'C'."
+        )
+    order = order[0].upper()
+    dpctl.utils.validate_usm_type(usm_type, allow_none=True)
+
+    if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)):
+        if (
+            isinstance(fill_value, dpt.usm_ndarray)
+            and sycl_queue is None
+            and device is None
+        ):
+            sycl_queue = fill_value.sycl_queue
+        else:
+            sycl_queue = normalize_queue_device(
+                sycl_queue=sycl_queue, device=device
+            )
+        X = dpt_ext.asarray(
+            fill_value,
+            dtype=dtype,
+            order=order,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+        )
+        return dpt_ext.copy(dpt_ext.broadcast_to(X, shape), order=order)
+    else:
+        _validate_fill_value(fill_value)
+
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    usm_type = usm_type if usm_type is not None else "device"
+    dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value))
+    res = dpt.usm_ndarray(
+        shape,
+        dtype=dtype,
+        buffer=usm_type,
+        order=order,
+        buffer_ctor_kwargs={"queue": sycl_queue},
+    )
+    fill_value = _cast_fill_val(fill_value, dtype)
+
+    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    # populating new allocation, no dependent events
+    hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue)
+    _manager.add_event_pair(hev, full_ev)
+    return res
+
+
+def full_like(
+    x,
+    /,
+    fill_value,
+    *,
+    dtype=None,
+    order="K",
+    device=None,
+    usm_type=None,
+    sycl_queue=None,
+):
+    """full_like(x, fill_value, dtype=None, order="K", \
+                 device=None, usm_type=None, sycl_queue=None)
+
+    Returns a new :class:`dpctl.tensor.usm_ndarray` filled with `fill_value`
+    and having the same `shape` as the input array `x`.
+
+    Args:
+        x (usm_ndarray): Input array from which to derive the output array
+            shape.
+        fill_value: the value to fill output array with
+        dtype (optional):
+            data type of the array. Can be typestring,
+            a :class:`numpy.dtype` object, :mod:`numpy` char string, or a
+            NumPy scalar type. If ``dtype`` is ``None``, the output array data
+            type is inferred from ``x``. Default: ``None``
+        order ("C", "F", "A", or "K"):
+            memory layout for the array. Default: ``"K"``
+        device (optional):
+            array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            New array initialized with given value.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.")
+    if (
+        not isinstance(order, str)
+        or len(order) == 0
+        or order[0] not in "CcFfAaKk"
+    ):
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'."
+        )
+    order = order[0].upper()
+    if dtype is None:
+        dtype = x.dtype
+    if usm_type is None:
+        usm_type = x.usm_type
+    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    if device is None and sycl_queue is None:
+        device = x.device
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    sh = x.shape
+    dtype = dpt.dtype(dtype)
+    order = _normalize_order(order, x)
+    if order == "K":
+        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+        if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)):
+            X = dpt_ext.asarray(
+                fill_value,
+                dtype=dtype,
+                order=order,
+                usm_type=usm_type,
+                sycl_queue=sycl_queue,
+            )
+            X = dpt_ext.broadcast_to(X, sh)
+            res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
+            _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+            # order copy after tasks populating X
+            dep_evs = _manager.submitted_events
+            hev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=X, dst=res, sycl_queue=sycl_queue, depends=dep_evs
+            )
+            _manager.add_event_pair(hev, copy_ev)
+            return res
+        else:
+            _validate_fill_value(fill_value)
+
+        dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value))
+        res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
+        fill_value = _cast_fill_val(fill_value, dtype)
+        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+        # populating new allocation, no dependent events
+        hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue)
+        _manager.add_event_pair(hev, full_ev)
+        return res
+    else:
+        return full(
+            sh,
+            fill_value,
+            dtype=dtype,
+            order=order,
+            device=device,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+        )
+
+
+def linspace(
+    start,
+    stop,
+    /,
+    num,
+    *,
+    dtype=None,
+    device=None,
+    endpoint=True,
+    sycl_queue=None,
+    usm_type="device",
+):
+    """
+    linspace(start, stop, num, dtype=None, device=None, endpoint=True, \
+        sycl_queue=None, usm_type="device")
+
+    Returns :class:`dpctl.tensor.usm_ndarray` array populated with
+    evenly spaced numbers of specified interval.
+
+    Args:
+        start:
+            the start of the interval.
+        stop:
+            the end of the interval. If the ``endpoint`` is ``False``, the
+            function generates ``num+1`` evenly spaced points starting
+            with ``start`` and ending with ``stop`` and exclude the
+            ``stop`` from the returned array such that the returned array
+            consists of evenly spaced numbers over the half-open interval
+            ``[start, stop)``. If ``endpoint`` is ``True``, the output
+            array consists of evenly spaced numbers over the closed
+            interval ``[start, stop]``. Default: ``True``
+        num (int):
+            number of samples. Must be a non-negative integer; otherwise,
+            the function raises ``ValueError`` exception.
+        dtype:
+            output array data type. Should be a floating data type.
+            If ``dtype`` is ``None``, the output array must be the default
+            floating point data type for target device.
+            Default: ``None``
+        device (optional):
+            array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+        endpoint: boolean indicating whether to include ``stop`` in the
+            interval. Default: ``True``
+
+    Returns:
+        usm_ndarray:
+            Array populated with evenly spaced numbers in the requested
+            interval.
+    """
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    if endpoint not in [True, False]:
+        raise TypeError("endpoint keyword argument must be of boolean type")
+
+    num = operator.index(num)
+    if num < 0:
+        raise ValueError("Number of points must be non-negative")
+
+    _, dt = _coerce_and_infer_dt(
+        start,
+        stop,
+        dt=dtype,
+        sycl_queue=sycl_queue,
+        err_msg="start and stop must be Python scalars.",
+        allow_bool=True,
+    )
+
+    int_dt = None
+    if np.issubdtype(dt, np.integer):
+        if dtype is not None:
+            int_dt = dt
+        dt = ti.default_device_fp_type(sycl_queue)
+        dt = dpt.dtype(dt)
+        start = float(start)
+        stop = float(stop)
+
+    res = dpt_ext.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue)
+    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    hev, la_ev = ti._linspace_affine(
+        start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue
+    )
+    _manager.add_event_pair(hev, la_ev)
+
+    return res if int_dt is None else dpt_ext.astype(res, int_dt)
+
+
+def meshgrid(*arrays, indexing="xy"):
+    """
+    Creates list of :class:`dpctl.tensor.usm_ndarray` coordinate matrices
+    from vectors.
+
+    Args:
+        arrays (usm_ndarray):
+            an arbitrary number of one-dimensional arrays
+            representing grid coordinates. Each array should have the same
+            numeric data type.
+        indexing (``"xy"``, or ``"ij"``):
+            Cartesian (``"xy"``) or matrix (``"ij"``) indexing of output.
+            If provided zero or one one-dimensional vector(s) (i.e., the
+            zero- and one-dimensional cases, respectively), the ``indexing``
+            keyword has no effect and should be ignored. Default: ``"xy"``
+
+    Returns:
+        List[array]:
+            list of ``N`` arrays, where ``N`` is the number of
+            provided one-dimensional input arrays. Each returned array must
+            have rank ``N``.
+            For a set of ``n`` vectors with lengths ``N0``, ``N1``, ``N2``, ...
+            The cartesian indexing results in arrays of shape
+            ``(N1, N0, N2, ...)``, while the
+            matrix indexing results in arrays of shape
+            ``(N0, N1, N2, ...)``.
+            Default: ``"xy"``.
+
+    Raises:
+        ValueError: If vectors are not of the same data type, or are not
+            one-dimensional.
+
+    """
+    ref_dt = None
+    ref_unset = True
+    for array in arrays:
+        if not isinstance(array, dpt.usm_ndarray):
+            raise TypeError(
+                f"Expected instance of dpt.usm_ndarray, got {type(array)}."
+            )
+        if array.ndim != 1:
+            raise ValueError("All arrays must be one-dimensional.")
+        if ref_unset:
+            ref_unset = False
+            ref_dt = array.dtype
+        else:
+            if not ref_dt == array.dtype:
+                raise ValueError(
+                    "All arrays must be of the same numeric data type."
+                )
+    if indexing not in ["xy", "ij"]:
+        raise ValueError(
+            "Unrecognized indexing keyword value, expecting 'xy' or 'ij.'"
+        )
+    n = len(arrays)
+    if n == 0:
+        return []
+
+    sh = (-1,) + (1,) * (n - 1)
+
+    res = []
+    if n > 1 and indexing == "xy":
+        res.append(dpt_ext.reshape(arrays[0], (1, -1) + sh[2:], copy=True))
+        res.append(dpt_ext.reshape(arrays[1], sh, copy=True))
+        arrays, sh = arrays[2:], sh[-2:] + sh[:-2]
+
+    for array in arrays:
+        res.append(dpt_ext.reshape(array, sh, copy=True))
+        sh = sh[-1:] + sh[:-1]
+
+    output = dpt_ext.broadcast_arrays(*res)
 
+    return output
 
-def eye(
-    n_rows,
-    n_cols=None,
-    /,
+
+def ones(
+    shape,
     *,
-    k=0,
     dtype=None,
     order="C",
     device=None,
     usm_type="device",
     sycl_queue=None,
 ):
-    """
-    eye(n_rows, n_cols=None, /, *, k=0, dtype=None, \
-        device=None, usm_type="device", sycl_queue=None)
+    """ones(shape, dtype=None, order="C", \
+            device=None, usm_type="device", sycl_queue=None)
 
-    Creates :class:`dpctl.tensor.usm_ndarray` with ones on the `k`-th
-    diagonal.
+    Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified
+    shape and filled with ones.
 
     Args:
-        n_rows (int):
-            number of rows in the output array.
-        n_cols (int, optional):
-            number of columns in the output array. If ``None``,
-            ``n_cols = n_rows``. Default: ``None``
-        k (int):
-            index of the diagonal, with ``0`` as the main diagonal.
-            A positive value of ``k`` is a superdiagonal, a negative value
-            is a subdiagonal.
-            Raises :exc:`TypeError` if ``k`` is not an integer.
-            Default: ``0``
+        shape (Tuple[int], int):
+            Dimensions of the array to be created.
         dtype (optional):
             data type of the array. Can be typestring,
-            a :class:`numpy.dtype` object, :mod:`numpy` char string, or
-            a NumPy scalar type. Default: ``None``
-        order ("C" or "F"):
-            memory layout for the array. Default: ``"C"``
-        device (optional):
-            array API concept of device where the output array
+            a :class:`numpy.dtype` object, :mod:`numpy` char string,
+            or a NumPy scalar type. Default: ``None``
+        order ("C", or "F"): memory layout for the array. Default: ``"C"``
+        device (optional): array API concept of device where the output array
             is created. ``device`` can be ``None``, a oneAPI filter selector
             string, an instance of :class:`dpctl.SyclDevice` corresponding to
             a non-partitioned SYCL device, an instance of
@@ -158,79 +1563,48 @@ def eye(
 
     Returns:
         usm_ndarray:
-            A diagonal matrix.
+            Created array initialized with ones.
     """
     if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
         raise ValueError(
             "Unrecognized order keyword value, expecting 'F' or 'C'."
         )
     order = order[0].upper()
-    n_rows = operator.index(n_rows)
-    n_cols = n_rows if n_cols is None else operator.index(n_cols)
-    k = operator.index(k)
-    if k >= n_cols or -k >= n_rows:
-        return dpt.zeros(
-            (n_rows, n_cols),
-            dtype=dtype,
-            order=order,
-            device=device,
-            usm_type=usm_type,
-            sycl_queue=sycl_queue,
-        )
     dpctl.utils.validate_usm_type(usm_type, allow_none=False)
     sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
     dtype = _get_dtype(dtype, sycl_queue)
-    _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
     res = dpt.usm_ndarray(
-        (n_rows, n_cols),
+        shape,
         dtype=dtype,
         buffer=usm_type,
         order=order,
         buffer_ctor_kwargs={"queue": sycl_queue},
     )
-    if n_rows != 0 and n_cols != 0:
-        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
-        hev, eye_ev = ti._eye(k, dst=res, sycl_queue=sycl_queue)
-        _manager.add_event_pair(hev, eye_ev)
+    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    # populating new allocation, no dependent events
+    hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue)
+    _manager.add_event_pair(hev, full_ev)
     return res
 
 
-def _validate_fill_value(fill_val):
-    """Validates that `fill_val` is a numeric or boolean scalar."""
-    # TODO: verify if `np.True_` and `np.False_` should be instances of
-    # Number in NumPy, like other NumPy scalars and like Python bools
-    # check for `np.bool_` separately as NumPy<2 has no `np.bool`
-    if not isinstance(fill_val, Number) and not isinstance(fill_val, np.bool_):
-        raise TypeError(
-            f"array cannot be filled with scalar of type {type(fill_val)}"
-        )
-
-
-def full(
-    shape,
-    fill_value,
-    *,
-    dtype=None,
-    order="C",
-    device=None,
-    usm_type=None,
-    sycl_queue=None,
+def ones_like(
+    x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None
 ):
     """
-    Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified
-    shape and filled with `fill_value`.
+    Returns a new :class:`dpctl.tensor.usm_ndarray` filled with ones and
+    having the same `shape` as the input array `x`.
 
     Args:
-        shape (tuple):
-            Dimensions of the array to be created.
-        fill_value (int,float,complex,usm_ndarray):
-            fill value
-        dtype (optional): data type of the array. Can be typestring,
+        x (usm_ndarray):
+            Input array from which to derive the output array shape
+        dtype (optional):
+            data type of the array. Can be typestring,
             a :class:`numpy.dtype` object, :mod:`numpy` char string,
-            or a NumPy scalar type. Default: ``None``
-        order ("C", or "F"):
+            or a NumPy scalar type. Default: `None`
+        order ("C", "F", "A", or "K"):
             memory layout for the array. Default: ``"C"``
-        device (optional): array API concept of device where the output array
+        device (optional):
+            array API concept of device where the output array
             is created. ``device`` can be ``None``, a oneAPI filter selector
             string, an instance of :class:`dpctl.SyclDevice` corresponding to
             a non-partitioned SYCL device, an instance of
@@ -251,54 +1625,47 @@ def full(
 
     Returns:
         usm_ndarray:
-            New array initialized with given value.
+            New array initialized with ones.
     """
-    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.")
+    if (
+        not isinstance(order, str)
+        or len(order) == 0
+        or order[0] not in "CcFfAaKk"
+    ):
         raise ValueError(
-            "Unrecognized order keyword value, expecting 'F' or 'C'."
+            "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'."
         )
     order = order[0].upper()
-    dpctl.utils.validate_usm_type(usm_type, allow_none=True)
-
-    if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)):
-        if (
-            isinstance(fill_value, dpt.usm_ndarray)
-            and sycl_queue is None
-            and device is None
-        ):
-            sycl_queue = fill_value.sycl_queue
-        else:
-            sycl_queue = normalize_queue_device(
-                sycl_queue=sycl_queue, device=device
-            )
-        X = dpt.asarray(
-            fill_value,
+    if dtype is None:
+        dtype = x.dtype
+    if usm_type is None:
+        usm_type = x.usm_type
+    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    if device is None and sycl_queue is None:
+        device = x.device
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dtype = dpt.dtype(dtype)
+    order = _normalize_order(order, x)
+    if order == "K":
+        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+        res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
+        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+        # populating new allocation, no dependent events
+        hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue)
+        _manager.add_event_pair(hev, full_ev)
+        return res
+    else:
+        sh = x.shape
+        return ones(
+            sh,
             dtype=dtype,
             order=order,
+            device=device,
             usm_type=usm_type,
             sycl_queue=sycl_queue,
         )
-        return dpt_ext.copy(dpt.broadcast_to(X, shape), order=order)
-    else:
-        _validate_fill_value(fill_value)
-
-    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    usm_type = usm_type if usm_type is not None else "device"
-    dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value))
-    res = dpt.usm_ndarray(
-        shape,
-        dtype=dtype,
-        buffer=usm_type,
-        order=order,
-        buffer_ctor_kwargs={"queue": sycl_queue},
-    )
-    fill_value = _cast_fill_val(fill_value, dtype)
-
-    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
-    # populating new allocation, no dependent events
-    hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue)
-    _manager.add_event_pair(hev, full_ev)
-    return res
 
 
 def tril(x, /, *, k=0):
@@ -340,7 +1707,7 @@ def tril(x, /, *, k=0):
 
     q = x.sycl_queue
     if k >= shape[nd - 1] - 1:
-        res = dpt.empty(
+        res = dpt_ext.empty(
             x.shape,
             dtype=x.dtype,
             order=order,
@@ -354,7 +1721,7 @@ def tril(x, /, *, k=0):
         )
         _manager.add_event_pair(hev, cpy_ev)
     elif k < -shape[nd - 2]:
-        res = dpt.zeros(
+        res = dpt_ext.zeros(
             x.shape,
             dtype=x.dtype,
             order=order,
@@ -362,7 +1729,7 @@ def tril(x, /, *, k=0):
             sycl_queue=q,
         )
     else:
-        res = dpt.empty(
+        res = dpt_ext.empty(
             x.shape,
             dtype=x.dtype,
             order=order,
@@ -418,7 +1785,7 @@ def triu(x, /, *, k=0):
 
     q = x.sycl_queue
     if k > shape[nd - 1]:
-        res = dpt.zeros(
+        res = dpt_ext.zeros(
             x.shape,
             dtype=x.dtype,
             order=order,
@@ -426,7 +1793,7 @@ def triu(x, /, *, k=0):
             sycl_queue=q,
         )
     elif k <= -shape[nd - 2] + 1:
-        res = dpt.empty(
+        res = dpt_ext.empty(
             x.shape,
             dtype=x.dtype,
             order=order,
@@ -440,7 +1807,7 @@ def triu(x, /, *, k=0):
         )
         _manager.add_event_pair(hev, cpy_ev)
     else:
-        res = dpt.empty(
+        res = dpt_ext.empty(
             x.shape,
             dtype=x.dtype,
             order=order,
@@ -455,3 +1822,156 @@ def triu(x, /, *, k=0):
         _manager.add_event_pair(hev, triu_ev)
 
     return res
+
+
+def zeros(
+    shape,
+    *,
+    dtype=None,
+    order="C",
+    device=None,
+    usm_type="device",
+    sycl_queue=None,
+):
+    """
+    Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified
+    shape and filled with zeros.
+
+    Args:
+        shape (Tuple[int], int):
+            Dimensions of the array to be created.
+        dtype (optional):
+            data type of the array. Can be typestring,
+            a :class:`numpy.dtype` object, :mod:`numpy` char string,
+            or a NumPy scalar type. Default: ``None``
+        order ("C", or "F"):
+            memory layout for the array. Default: ``"C"``
+        device (optional): array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            Constructed array initialized with zeros.
+    """
+    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'F' or 'C'."
+        )
+    order = order[0].upper()
+    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dtype = _get_dtype(dtype, sycl_queue)
+    _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+    res = dpt.usm_ndarray(
+        shape,
+        dtype=dtype,
+        buffer=usm_type,
+        order=order,
+        buffer_ctor_kwargs={"queue": sycl_queue},
+    )
+    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    # populating new allocation, no dependent events
+    hev, zeros_ev = ti._zeros_usm_ndarray(res, sycl_queue)
+    _manager.add_event_pair(hev, zeros_ev)
+
+    return res
+
+
+def zeros_like(
+    x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None
+):
+    """
+    Creates :class:`dpctl.tensor.usm_ndarray` from USM allocation
+    initialized with zeros.
+
+    Args:
+        x (usm_ndarray):
+            Input array from which to derive the shape of the
+            output array.
+        dtype (optional):
+            data type of the array. Can be typestring,
+            a :class:`numpy.dtype` object, :mod:`numpy` char string, or a
+            NumPy scalar type. If `None`, output array has the same data
+            type as the input array. Default: ``None``
+        order ("C", or "F"):
+            memory layout for the array. Default: ``"C"``
+        device (optional):
+            array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            New array initialized with zeros.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.")
+    if (
+        not isinstance(order, str)
+        or len(order) == 0
+        or order[0] not in "CcFfAaKk"
+    ):
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'."
+        )
+    order = order[0].upper()
+    if dtype is None:
+        dtype = x.dtype
+    if usm_type is None:
+        usm_type = x.usm_type
+    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    if device is None and sycl_queue is None:
+        device = x.device
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dtype = dpt.dtype(dtype)
+    order = _normalize_order(order, x)
+    if order == "K":
+        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+        res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
+        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+        # populating new allocation, no dependent events
+        hev, full_ev = ti._full_usm_ndarray(0, res, sycl_queue)
+        _manager.add_event_pair(hev, full_ev)
+        return res
+    else:
+        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+        sh = x.shape
+        return zeros(
+            sh,
+            dtype=dtype,
+            order=order,
+            device=device,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+        )
diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py
index 6ca327192f73..5b4eb1aaf7a2 100644
--- a/dpctl_ext/tensor/_indexing_functions.py
+++ b/dpctl_ext/tensor/_indexing_functions.py
@@ -57,7 +57,7 @@ def _get_indexing_mode(name):
 
 
 def _range(sh_i, i, nd, q, usm_t, dt):
-    ind = dpt.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q)
+    ind = dpt_ext.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q)
     ind.shape = tuple(sh_i if i == j else 1 for j in range(nd))
     return ind
 
@@ -177,7 +177,7 @@ def place(arr, mask, vals):
         raise dpctl.utils.ExecutionPlacementError
     if arr.shape != mask.shape or vals.ndim != 1:
         raise ValueError("Array sizes are not as required")
-    cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q)
+    cumsum = dpt_ext.empty(mask.size, dtype="i8", sycl_queue=exec_q)
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
     deps_ev = _manager.submitted_events
     nz_count = ti.mask_positions(
@@ -190,7 +190,7 @@ def place(arr, mask, vals):
     if vals.dtype == arr.dtype:
         rhs = vals
     else:
-        rhs = dpt.astype(vals, arr.dtype)
+        rhs = dpt_ext.astype(vals, arr.dtype)
     hev, pl_ev = ti._place(
         dst=arr,
         cumsum=cumsum,
@@ -329,7 +329,7 @@ def put_vec_duplicates(vec, ind, vals):
         val_shape = indices.shape
 
     if not isinstance(vals, dpt.usm_ndarray):
-        vals = dpt.asarray(
+        vals = dpt_ext.asarray(
             vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q
         )
     # choose to throw here for consistency with `place`
@@ -341,7 +341,7 @@ def put_vec_duplicates(vec, ind, vals):
         rhs = vals
     else:
         rhs = dpt_ext.astype(vals, x.dtype)
-    rhs = dpt.broadcast_to(rhs, val_shape)
+    rhs = dpt_ext.broadcast_to(rhs, val_shape)
 
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
     deps_ev = _manager.submitted_events
@@ -540,9 +540,9 @@ def take(x, indices, /, *, axis=None, out=None, mode="wrap"):
                 "Input and output allocation queues are not compatible"
             )
         if ti._array_overlap(x, out):
-            out = dpt.empty_like(out)
+            out = dpt_ext.empty_like(out)
     else:
-        out = dpt.empty(
+        out = dpt_ext.empty(
             res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q
         )
 
diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py
index f1b8b46dbcbc..08459dcaea76 100644
--- a/dpctl_ext/tensor/_manipulation_functions.py
+++ b/dpctl_ext/tensor/_manipulation_functions.py
@@ -40,6 +40,7 @@
 import dpctl_ext.tensor._tensor_impl as ti
 
 from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
+from ._type_utils import _supported_dtype, _to_device_supported_dtype
 
 __doc__ = (
     "Implementation module for array manipulation "
@@ -47,6 +48,55 @@
 )
 
 
+def _arrays_validation(arrays, check_ndim=True):
+    n = len(arrays)
+    if n == 0:
+        raise TypeError("Missing 1 required positional argument: 'arrays'.")
+
+    if not isinstance(arrays, (list, tuple)):
+        raise TypeError(f"Expected tuple or list type, got {type(arrays)}.")
+
+    for X in arrays:
+        if not isinstance(X, dpt.usm_ndarray):
+            raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    exec_q = dputils.get_execution_queue([X.sycl_queue for X in arrays])
+    if exec_q is None:
+        raise ValueError("All the input arrays must have same sycl queue.")
+
+    res_usm_type = dputils.get_coerced_usm_type([X.usm_type for X in arrays])
+    if res_usm_type is None:
+        raise ValueError("All the input arrays must have usm_type.")
+
+    X0 = arrays[0]
+    _supported_dtype(Xi.dtype for Xi in arrays)
+
+    res_dtype = X0.dtype
+    dev = exec_q.sycl_device
+    for i in range(1, n):
+        res_dtype = np.promote_types(res_dtype, arrays[i])
+        res_dtype = _to_device_supported_dtype(res_dtype, dev)
+
+    if check_ndim:
+        for i in range(1, n):
+            if X0.ndim != arrays[i].ndim:
+                raise ValueError(
+                    "All the input arrays must have same number of dimensions, "
+                    f"but the array at index 0 has {X0.ndim} dimension(s) and "
+                    f"the array at index {i} has {arrays[i].ndim} dimension(s)."
+                )
+    return res_dtype, res_usm_type, exec_q
+
+
+def _broadcast_shapes(*args):
+    """
+    Broadcast the input shapes into a single shape;
+    returns tuple broadcasted shape.
+    """
+    array_shapes = [array.shape for array in args]
+    return _broadcast_shape_impl(array_shapes)
+
+
 def _broadcast_shape_impl(shapes):
     if len(set(shapes)) == 1:
         return shapes[0]
@@ -86,6 +136,395 @@ def _broadcast_shape_impl(shapes):
     return tuple(common_shape)
 
 
+def _broadcast_strides(X_shape, X_strides, res_ndim):
+    """
+    Broadcasts strides to match the given dimensions;
+    returns tuple type strides.
+    """
+    out_strides = [0] * res_ndim
+    X_shape_len = len(X_shape)
+    str_dim = -X_shape_len
+    for i in range(X_shape_len):
+        shape_value = X_shape[i]
+        if not shape_value == 1:
+            out_strides[str_dim] = X_strides[i]
+        str_dim += 1
+
+    return tuple(out_strides)
+
+
+def _check_same_shapes(X0_shape, axis, n, arrays):
+    for i in range(1, n):
+        Xi_shape = arrays[i].shape
+        for j, X0j in enumerate(X0_shape):
+            if X0j != Xi_shape[j] and j != axis:
+                raise ValueError(
+                    "All the input array dimensions for the concatenation "
+                    f"axis must match exactly, but along dimension {j}, the "
+                    f"array at index 0 has size {X0j} and the array "
+                    f"at index {i} has size {Xi_shape[j]}."
+                )
+
+
+def _concat_axis_None(arrays):
+    """Implementation of concat(arrays, axis=None)."""
+    res_dtype, res_usm_type, exec_q = _arrays_validation(
+        arrays, check_ndim=False
+    )
+    res_shape = 0
+    for array in arrays:
+        res_shape += array.size
+    res = dpt_ext.empty(
+        res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
+    )
+
+    fill_start = 0
+    _manager = dputils.SequentialOrderManager[exec_q]
+    deps = _manager.submitted_events
+    for array in arrays:
+        fill_end = fill_start + array.size
+        if array.flags.c_contiguous:
+            hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=dpt_ext.reshape(array, -1),
+                dst=res[fill_start:fill_end],
+                sycl_queue=exec_q,
+                depends=deps,
+            )
+            _manager.add_event_pair(hev, cpy_ev)
+        else:
+            src_ = array
+            # _copy_usm_ndarray_for_reshape requires src and dst to have
+            # the same data type
+            if not array.dtype == res_dtype:
+                src2_ = dpt_ext.empty_like(src_, dtype=res_dtype)
+                ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=src_, dst=src2_, sycl_queue=exec_q, depends=deps
+                )
+                _manager.add_event_pair(ht_copy_ev, cpy_ev)
+                hev, reshape_copy_ev = ti._copy_usm_ndarray_for_reshape(
+                    src=src2_,
+                    dst=res[fill_start:fill_end],
+                    sycl_queue=exec_q,
+                    depends=[cpy_ev],
+                )
+                _manager.add_event_pair(hev, reshape_copy_ev)
+            else:
+                hev, cpy_ev = ti._copy_usm_ndarray_for_reshape(
+                    src=src_,
+                    dst=res[fill_start:fill_end],
+                    sycl_queue=exec_q,
+                    depends=deps,
+                )
+                _manager.add_event_pair(hev, cpy_ev)
+        fill_start = fill_end
+
+    return res
+
+
+def broadcast_arrays(*args):
+    """broadcast_arrays(*arrays)
+
+    Broadcasts one or more :class:`dpctl.tensor.usm_ndarrays` against
+    one another.
+
+    Args:
+        arrays (usm_ndarray): an arbitrary number of arrays to be
+            broadcasted.
+
+    Returns:
+        List[usm_ndarray]:
+            A list of broadcasted arrays. Each array
+            must have the same shape. Each array must have the same `dtype`,
+            `device` and `usm_type` attributes as its corresponding input
+            array.
+    """
+    if len(args) == 0:
+        raise ValueError("`broadcast_arrays` requires at least one argument")
+    for X in args:
+        if not isinstance(X, dpt.usm_ndarray):
+            raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    shape = _broadcast_shapes(*args)
+
+    if all(X.shape == shape for X in args):
+        return args
+
+    return [broadcast_to(X, shape) for X in args]
+
+
+def broadcast_to(X, /, shape):
+    """broadcast_to(x, shape)
+
+    Broadcast an array to a new `shape`; returns the broadcasted
+    :class:`dpctl.tensor.usm_ndarray` as a view.
+
+    Args:
+        x (usm_ndarray): input array
+        shape (Tuple[int,...]): array shape. The `shape` must be
+            compatible with `x` according to broadcasting rules.
+
+    Returns:
+        usm_ndarray:
+            An array with the specified `shape`.
+            The output array is a view of the input array, and
+            hence has the same data type, USM allocation type and
+            device attributes.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    # Use numpy.broadcast_to to check the validity of the input
+    # parameter 'shape'. Raise ValueError if 'X' is not compatible
+    # with 'shape' according to NumPy's broadcasting rules.
+    new_array = np.broadcast_to(
+        np.broadcast_to(np.empty(tuple(), dtype="u1"), X.shape), shape
+    )
+    new_sts = _broadcast_strides(X.shape, X.strides, new_array.ndim)
+    return dpt.usm_ndarray(
+        shape=new_array.shape,
+        dtype=X.dtype,
+        buffer=X,
+        strides=new_sts,
+        offset=X._element_offset,
+    )
+
+
+def concat(arrays, /, *, axis=0):
+    """concat(arrays, axis)
+
+    Joins a sequence of arrays along an existing axis.
+
+    Args:
+        arrays (Union[List[usm_ndarray, Tuple[usm_ndarray,...]]]):
+            input arrays to join. The arrays must have the same shape,
+            except in the dimension specified by `axis`.
+        axis (Optional[int]): axis along which the arrays will be joined.
+            If `axis` is `None`, arrays must be flattened before
+            concatenation. If `axis` is negative, it is understood as
+            being counted from the last dimension. Default: `0`.
+
+    Returns:
+        usm_ndarray:
+            An output array containing the concatenated
+            values. The output array data type is determined by Type
+            Promotion Rules of array API.
+
+    All input arrays must have the same device attribute. The output array
+    is allocated on that same device, and data movement operations are
+    scheduled on a queue underlying the device. The USM allocation type
+    of the output array is determined by USM allocation type promotion
+    rules.
+    """
+    if axis is None:
+        return _concat_axis_None(arrays)
+
+    res_dtype, res_usm_type, exec_q = _arrays_validation(arrays)
+    n = len(arrays)
+    X0 = arrays[0]
+
+    axis = normalize_axis_index(axis, X0.ndim)
+    X0_shape = X0.shape
+    _check_same_shapes(X0_shape, axis, n, arrays)
+
+    res_shape_axis = 0
+    for X in arrays:
+        res_shape_axis = res_shape_axis + X.shape[axis]
+
+    res_shape = tuple(
+        X0_shape[i] if i != axis else res_shape_axis for i in range(X0.ndim)
+    )
+
+    res = dpt_ext.empty(
+        res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
+    )
+
+    _manager = dputils.SequentialOrderManager[exec_q]
+    deps = _manager.submitted_events
+    fill_start = 0
+    for i in range(n):
+        fill_end = fill_start + arrays[i].shape[axis]
+        c_shapes_copy = tuple(
+            np.s_[fill_start:fill_end] if j == axis else np.s_[:]
+            for j in range(X0.ndim)
+        )
+        hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arrays[i],
+            dst=res[c_shapes_copy],
+            sycl_queue=exec_q,
+            depends=deps,
+        )
+        _manager.add_event_pair(hev, cpy_ev)
+        fill_start = fill_end
+
+    return res
+
+
+def expand_dims(X, /, *, axis=0):
+    """expand_dims(x, axis)
+
+    Expands the shape of an array by inserting a new axis (dimension)
+    of size one at the position specified by axis.
+
+    Args:
+        x (usm_ndarray):
+            input array
+        axis (Union[int, Tuple[int]]):
+            axis position in the expanded axes (zero-based). If `x` has rank
+            (i.e, number of dimensions) `N`, a valid `axis` must reside
+            in the closed-interval `[-N-1, N]`. If provided a negative
+            `axis`, the `axis` position at which to insert a singleton
+            dimension is computed as `N + axis + 1`. Hence, if
+            provided `-1`, the resolved axis position is `N` (i.e.,
+            a singleton dimension must be appended to the input array `x`).
+            If provided `-N-1`, the resolved axis position is `0` (i.e., a
+            singleton dimension is prepended to the input array `x`).
+
+    Returns:
+        usm_ndarray:
+            Returns a view, if possible, and a copy otherwise with the number
+            of dimensions increased.
+            The expanded array has the same data type as the input array `x`.
+            The expanded array is located on the same device as the input
+            array, and has the same USM allocation type.
+
+    Raises:
+        IndexError: if `axis` value is invalid.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    if type(axis) not in (tuple, list):
+        axis = (axis,)
+
+    out_ndim = len(axis) + X.ndim
+    axis = normalize_axis_tuple(axis, out_ndim)
+
+    shape_it = iter(X.shape)
+    shape = tuple(1 if ax in axis else next(shape_it) for ax in range(out_ndim))
+
+    return dpt_ext.reshape(X, shape)
+
+
+def flip(X, /, *, axis=None):
+    """flip(x, axis)
+
+    Reverses the order of elements in an array `x` along the given `axis`.
+    The shape of the array is preserved, but the elements are reordered.
+
+    Args:
+        x (usm_ndarray): input array.
+        axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along
+            which to flip.
+            If `axis` is `None`, all input array axes are flipped.
+            If `axis` is negative, the flipped axis is counted from the
+            last dimension. If provided more than one axis, only the specified
+            axes are flipped. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            A view of `x` with the entries of `axis` reversed.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+    X_ndim = X.ndim
+    if axis is None:
+        indexer = (np.s_[::-1],) * X_ndim
+    else:
+        axis = normalize_axis_tuple(axis, X_ndim)
+        indexer = tuple(
+            np.s_[::-1] if i in axis else np.s_[:] for i in range(X.ndim)
+        )
+    return X[indexer]
+
+
+def moveaxis(X, source, destination, /):
+    """moveaxis(x, source, destination)
+
+    Moves axes of an array to new positions.
+
+    Args:
+        x (usm_ndarray): input array
+
+        source (int or a sequence of int):
+            Original positions of the axes to move.
+            These must be unique. If `x` has rank (i.e., number of
+            dimensions) `N`, a valid `axis` must be in the
+            half-open interval `[-N, N)`.
+
+        destination (int or a sequence of int):
+            Destination positions for each of the original axes.
+            These must also be unique. If `x` has rank
+            (i.e., number of dimensions) `N`, a valid `axis` must be
+            in the half-open interval `[-N, N)`.
+
+    Returns:
+        usm_ndarray:
+            Array with moved axes.
+            The returned array must has the same data type as `x`,
+            is created on the same device as `x` and has the same
+            USM allocation type as `x`.
+
+    Raises:
+        AxisError: if `axis` value is invalid.
+        ValueError: if `src` and `dst` have not equal number of elements.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    source = normalize_axis_tuple(source, X.ndim, "source")
+    destination = normalize_axis_tuple(destination, X.ndim, "destination")
+
+    if len(source) != len(destination):
+        raise ValueError(
+            "`source` and `destination` arguments must have "
+            "the same number of elements"
+        )
+
+    ind = [n for n in range(X.ndim) if n not in source]
+
+    for src, dst in sorted(zip(destination, source)):
+        ind.insert(src, dst)
+
+    return dpt_ext.permute_dims(X, tuple(ind))
+
+
+def permute_dims(X, /, axes):
+    """permute_dims(x, axes)
+
+    Permute the axes (dimensions) of an array; returns the permuted
+    array as a view.
+
+    Args:
+        x (usm_ndarray): input array.
+        axes (Tuple[int, ...]): tuple containing permutation of
+           `(0,1,...,N-1)` where `N` is the number of axes (dimensions)
+           of `x`.
+    Returns:
+        usm_ndarray:
+            An array with permuted axes.
+            The returned array must has the same data type as `x`,
+            is created on the same device as `x` and has the same USM allocation
+            type as `x`.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+    axes = normalize_axis_tuple(axes, X.ndim, "axes")
+    if not X.ndim == len(axes):
+        raise ValueError(
+            "The length of the passed axes does not match "
+            "to the number of usm_ndarray dimensions."
+        )
+    newstrides = tuple(X.strides[i] for i in axes)
+    newshape = tuple(X.shape[i] for i in axes)
+    return dpt.usm_ndarray(
+        shape=newshape,
+        dtype=X.dtype,
+        buffer=X,
+        strides=newstrides,
+        offset=X._element_offset,
+    )
+
+
 def repeat(x, repeats, /, *, axis=None):
     """repeat(x, repeats, axis=None)
 
@@ -204,7 +643,7 @@ def repeat(x, repeats, /, *, axis=None):
                     "`repeats` sequence must have the same length as the "
                     "repeated axis"
                 )
-            repeats = dpt.asarray(
+            repeats = dpt_ext.asarray(
                 repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q
             )
             if not dpt.all(repeats >= 0):
@@ -223,7 +662,7 @@ def repeat(x, repeats, /, *, axis=None):
             res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
         else:
             res_shape = (res_axis_size,)
-        res = dpt.empty(
+        res = dpt_ext.empty(
             res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q
         )
         if res_axis_size > 0:
@@ -238,7 +677,7 @@ def repeat(x, repeats, /, *, axis=None):
             _manager.add_event_pair(ht_rep_ev, rep_ev)
     else:
         if repeats.dtype != dpt.int64:
-            rep_buf = dpt.empty(
+            rep_buf = dpt_ext.empty(
                 repeats.shape,
                 dtype=dpt.int64,
                 usm_type=usm_type,
@@ -248,7 +687,7 @@ def repeat(x, repeats, /, *, axis=None):
                 src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs
             )
             _manager.add_event_pair(ht_copy_ev, copy_ev)
-            cumsum = dpt.empty(
+            cumsum = dpt_ext.empty(
                 (axis_size,),
                 dtype=dpt.int64,
                 usm_type=usm_type,
@@ -264,7 +703,7 @@ def repeat(x, repeats, /, *, axis=None):
                 )
             else:
                 res_shape = (res_axis_size,)
-            res = dpt.empty(
+            res = dpt_ext.empty(
                 res_shape,
                 dtype=x.dtype,
                 usm_type=usm_type,
@@ -281,7 +720,7 @@ def repeat(x, repeats, /, *, axis=None):
                 )
                 _manager.add_event_pair(ht_rep_ev, rep_ev)
         else:
-            cumsum = dpt.empty(
+            cumsum = dpt_ext.empty(
                 (axis_size,),
                 dtype=dpt.int64,
                 usm_type=usm_type,
@@ -296,7 +735,7 @@ def repeat(x, repeats, /, *, axis=None):
                 )
             else:
                 res_shape = (res_axis_size,)
-            res = dpt.empty(
+            res = dpt_ext.empty(
                 res_shape,
                 dtype=x.dtype,
                 usm_type=usm_type,
@@ -353,7 +792,7 @@ def roll(x, /, shift, *, axis=None):
     _manager = dputils.SequentialOrderManager[exec_q]
     if axis is None:
         shift = operator.index(shift)
-        res = dpt.empty(
+        res = dpt_ext.empty(
             x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
         )
         sz = operator.index(x.size)
@@ -380,7 +819,7 @@ def roll(x, /, shift, *, axis=None):
         n_i = operator.index(shape[ax])
         shifted = shifts[ax] + operator.index(sh)
         shifts[ax] = (shifted % n_i) if n_i > 0 else 0
-    res = dpt.empty(
+    res = dpt_ext.empty(
         x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
     )
     dep_evs = _manager.submitted_events
@@ -389,3 +828,273 @@ def roll(x, /, shift, *, axis=None):
     )
     _manager.add_event_pair(ht_e, roll_ev)
     return res
+
+
+def squeeze(X, /, axis=None):
+    """squeeze(x, axis)
+
+    Removes singleton dimensions (axes) from array `x`.
+
+    Args:
+        x (usm_ndarray): input array
+        axis (Union[int, Tuple[int,...]]): axis (or axes) to squeeze.
+
+    Returns:
+        usm_ndarray:
+            Output array is a view, if possible,
+            and a copy otherwise, but with all or a subset of the
+            dimensions of length 1 removed. Output has the same data
+            type as the input, is allocated on the same device as the
+            input and has the same USM allocation type as the input
+            array `x`.
+
+    Raises:
+        ValueError: if the specified axis has a size greater than one.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+    X_shape = X.shape
+    if axis is not None:
+        axis = normalize_axis_tuple(axis, X.ndim if X.ndim != 0 else X.ndim + 1)
+        new_shape = []
+        for i, x in enumerate(X_shape):
+            if i not in axis:
+                new_shape.append(x)
+            else:
+                if x != 1:
+                    raise ValueError(
+                        "Cannot select an axis to squeeze out "
+                        "which has size not equal to one."
+                    )
+        new_shape = tuple(new_shape)
+    else:
+        new_shape = tuple(axis for axis in X_shape if axis != 1)
+    if new_shape == X.shape:
+        return X
+    else:
+        return dpt_ext.reshape(X, new_shape)
+
+
+def stack(arrays, /, *, axis=0):
+    """
+    stack(arrays, axis)
+
+    Joins a sequence of arrays along a new axis.
+
+    Args:
+        arrays (Union[List[usm_ndarray], Tuple[usm_ndarray,...]]):
+            input arrays to join. Each array must have the same shape.
+        axis (int): axis along which the arrays will be joined. Providing
+            an `axis` specified the index of the new axis in the dimensions
+            of the output array. A valid axis must be on the interval
+            `[-N, N)`, where `N` is the rank (number of dimensions) of `x`.
+            Default: `0`.
+
+    Returns:
+        usm_ndarray:
+            An output array having rank `N+1`, where `N` is
+            the rank (number of dimensions) of `x`. If the input arrays have
+            different data types, array API Type Promotion Rules apply.
+
+    Raises:
+        ValueError: if not all input arrays have the same shape
+        IndexError: if provided an `axis` outside of the required interval.
+    """
+    res_dtype, res_usm_type, exec_q = _arrays_validation(arrays)
+
+    n = len(arrays)
+    X0 = arrays[0]
+    res_ndim = X0.ndim + 1
+    axis = normalize_axis_index(axis, res_ndim)
+    X0_shape = X0.shape
+
+    for i in range(1, n):
+        if X0_shape != arrays[i].shape:
+            raise ValueError("All input arrays must have the same shape")
+
+    res_shape = tuple(
+        X0_shape[i - 1 * (i >= axis)] if i != axis else n
+        for i in range(res_ndim)
+    )
+
+    res = dpt_ext.empty(
+        res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
+    )
+
+    _manager = dputils.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    for i in range(n):
+        c_shapes_copy = tuple(
+            i if j == axis else np.s_[:] for j in range(res_ndim)
+        )
+        _dst = res[c_shapes_copy]
+        hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arrays[i], dst=_dst, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(hev, cpy_ev)
+
+    return res
+
+
+def swapaxes(X, axis1, axis2):
+    """swapaxes(x, axis1, axis2)
+
+    Interchanges two axes of an array.
+
+    Args:
+        x (usm_ndarray): input array
+
+        axis1 (int): First axis.
+            If `x` has rank (i.e., number of dimensions) `N`,
+            a valid `axis` must be in the half-open interval `[-N, N)`.
+
+        axis2 (int): Second axis.
+            If `x` has rank (i.e., number of dimensions) `N`,
+            a valid `axis` must be in the half-open interval `[-N, N)`.
+
+    Returns:
+        usm_ndarray:
+            Array with swapped axes.
+            The returned array must has the same data type as `x`,
+            is created on the same device as `x` and has the same USM
+            allocation type as `x`.
+
+    Raises:
+        AxisError: if `axis` value is invalid.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    axis1 = normalize_axis_index(axis1, X.ndim, "axis1")
+    axis2 = normalize_axis_index(axis2, X.ndim, "axis2")
+
+    ind = list(range(0, X.ndim))
+    ind[axis1] = axis2
+    ind[axis2] = axis1
+    return dpt_ext.permute_dims(X, tuple(ind))
+
+
+def unstack(X, /, *, axis=0):
+    """unstack(x, axis=0)
+
+    Splits an array in a sequence of arrays along the given axis.
+
+    Args:
+        x (usm_ndarray): input array
+
+        axis (int, optional): axis along which `x` is unstacked.
+            If `x` has rank (i.e, number of dimensions) `N`,
+            a valid `axis` must reside in the half-open interval `[-N, N)`.
+            Default: `0`.
+
+    Returns:
+        Tuple[usm_ndarray,...]:
+            Output sequence of arrays which are views into the input array.
+
+    Raises:
+        AxisError: if the `axis` value is invalid.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    axis = normalize_axis_index(axis, X.ndim)
+    Y = dpt_ext.moveaxis(X, axis, 0)
+
+    return tuple(Y[i] for i in range(Y.shape[0]))
+
+
+def tile(x, repetitions, /):
+    """tile(x, repetitions)
+
+    Repeat an input array `x` along each axis a number of times given by
+    `repetitions`.
+
+    For `N` = len(`repetitions`) and `M` = len(`x.shape`):
+
+        * If `M < N`, `x` will have `N - M` new axes prepended to its shape
+        * If `M > N`, `repetitions` will have `M - N` ones prepended to it
+
+    Args:
+        x (usm_ndarray): input array
+
+        repetitions (Union[int, Tuple[int, ...]]):
+            The number of repetitions along each dimension of `x`.
+
+    Returns:
+        usm_ndarray:
+            tiled output array.
+
+            The returned array will have rank `max(M, N)`. If `S` is the
+            shape of `x` after prepending dimensions and `R` is
+            `repetitions` after prepending ones, then the shape of the
+            result will be `S[i] * R[i]` for each dimension `i`.
+
+            The returned array will have the same data type as `x`.
+            The returned array will be located on the same device as `x` and
+            have the same USM allocation type as `x`.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(x)}.")
+
+    if not isinstance(repetitions, tuple):
+        if isinstance(repetitions, int):
+            repetitions = (repetitions,)
+        else:
+            raise TypeError(
+                f"Expected tuple or integer type, got {type(repetitions)}."
+            )
+
+    rep_dims = len(repetitions)
+    x_dims = x.ndim
+    if rep_dims < x_dims:
+        repetitions = (x_dims - rep_dims) * (1,) + repetitions
+    elif x_dims < rep_dims:
+        x = dpt_ext.reshape(x, (rep_dims - x_dims) * (1,) + x.shape)
+    res_shape = tuple(map(lambda sh, rep: sh * rep, x.shape, repetitions))
+    # case of empty input
+    if x.size == 0:
+        return dpt_ext.empty(
+            res_shape,
+            dtype=x.dtype,
+            usm_type=x.usm_type,
+            sycl_queue=x.sycl_queue,
+        )
+    in_sh = x.shape
+    if res_shape == in_sh:
+        return dpt_ext.copy(x)
+    expanded_sh = []
+    broadcast_sh = []
+    out_sz = 1
+    for i in range(len(res_shape)):
+        out_sz *= res_shape[i]
+        reps, sh = repetitions[i], in_sh[i]
+        if reps == 1:
+            # dimension will be unchanged
+            broadcast_sh.append(sh)
+            expanded_sh.append(sh)
+        elif sh == 1:
+            # dimension will be broadcast
+            broadcast_sh.append(reps)
+            expanded_sh.append(sh)
+        else:
+            broadcast_sh.extend([reps, sh])
+            expanded_sh.extend([1, sh])
+    exec_q = x.sycl_queue
+    xdt = x.dtype
+    xut = x.usm_type
+    res = dpt_ext.empty((out_sz,), dtype=xdt, usm_type=xut, sycl_queue=exec_q)
+    # no need to copy data for empty output
+    if out_sz > 0:
+        x = dpt_ext.broadcast_to(
+            # this reshape should never copy
+            dpt_ext.reshape(x, expanded_sh),
+            broadcast_sh,
+        )
+        # copy broadcast input into flat array
+        _manager = dputils.SequentialOrderManager[exec_q]
+        dep_evs = _manager.submitted_events
+        hev, cp_ev = ti._copy_usm_ndarray_for_reshape(
+            src=x, dst=res, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(hev, cp_ev)
+    return dpt_ext.reshape(res, res_shape)
diff --git a/dpctl_ext/tensor/_reshape.py b/dpctl_ext/tensor/_reshape.py
index 61aa6c9c754f..23cf47a83568 100644
--- a/dpctl_ext/tensor/_reshape.py
+++ b/dpctl_ext/tensor/_reshape.py
@@ -32,9 +32,11 @@
 import dpctl.utils
 import numpy as np
 
-# TODO: revert to `from dpctl.tensor._tensor_impl...`
+# TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._tensor_impl import (
+import dpctl_ext.tensor as dpt_ext
+
+from ._tensor_impl import (
     _copy_usm_ndarray_for_reshape,
     _ravel_multi_index,
     _unravel_index,
@@ -187,7 +189,7 @@ def reshape(X, /, shape, *, order="C", copy=None):
                 src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs
             )
         else:
-            X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1))
+            X_t = dpt_ext.permute_dims(X, range(X.ndim - 1, -1, -1))
             hev, r_e = _copy_usm_ndarray_for_reshape(
                 src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs
             )
diff --git a/dpctl_ext/tensor/_scalar_utils.py b/dpctl_ext/tensor/_scalar_utils.py
index 86787baea8cc..3ab92b42ad00 100644
--- a/dpctl_ext/tensor/_scalar_utils.py
+++ b/dpctl_ext/tensor/_scalar_utils.py
@@ -33,6 +33,10 @@
 import numpy as np
 from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+
 from ._type_utils import (
     WeakBooleanType,
     WeakComplexType,
@@ -59,7 +63,7 @@ def _get_dtype(o, dev):
     if isinstance(o, dpt.usm_ndarray):
         return o.dtype
     if hasattr(o, "__sycl_usm_array_interface__"):
-        return dpt.asarray(o).dtype
+        return dpt_ext.asarray(o).dtype
     if _is_buffer(o):
         host_dt = np.array(o).dtype
         dev_dt = _to_device_supported_dtype(host_dt, dev)
diff --git a/dpctl_ext/tensor/_search_functions.py b/dpctl_ext/tensor/_search_functions.py
index a82845e3520c..285a02b42bb8 100644
--- a/dpctl_ext/tensor/_search_functions.py
+++ b/dpctl_ext/tensor/_search_functions.py
@@ -291,7 +291,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
         if ti._array_overlap(condition, out) and not ti._same_logical_tensors(
             condition, out
         ):
-            out = dpt.empty_like(out)
+            out = dpt_ext.empty_like(out)
 
         if isinstance(x1, dpt.usm_ndarray):
             if (
@@ -299,7 +299,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
                 and not ti._same_logical_tensors(x1, out)
                 and x1_dtype == out_dtype
             ):
-                out = dpt.empty_like(out)
+                out = dpt_ext.empty_like(out)
 
         if isinstance(x2, dpt.usm_ndarray):
             if (
@@ -307,7 +307,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
                 and not ti._same_logical_tensors(x2, out)
                 and x2_dtype == out_dtype
             ):
-                out = dpt.empty_like(out)
+                out = dpt_ext.empty_like(out)
 
     if order == "A":
         order = (
@@ -323,9 +323,9 @@ def where(condition, x1, x2, /, *, order="K", out=None):
             else "C"
         )
     if not isinstance(x1, dpt.usm_ndarray):
-        x1 = dpt.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q)
+        x1 = dpt_ext.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q)
     if not isinstance(x2, dpt.usm_ndarray):
-        x2 = dpt.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q)
+        x2 = dpt_ext.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q)
 
     if condition.size == 0:
         if out is not None:
@@ -342,7 +342,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
                     exec_q,
                 )
             else:
-                return dpt.empty(
+                return dpt_ext.empty(
                     res_shape,
                     dtype=out_dtype,
                     order=order,
@@ -356,7 +356,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
         if order == "K":
             _x1 = _empty_like_orderK(x1, out_dtype)
         else:
-            _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order)
+            _x1 = dpt_ext.empty_like(x1, dtype=out_dtype, order=order)
         ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs
         )
@@ -367,7 +367,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
         if order == "K":
             _x2 = _empty_like_orderK(x2, out_dtype)
         else:
-            _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order)
+            _x2 = dpt_ext.empty_like(x2, dtype=out_dtype, order=order)
         ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs
         )
@@ -380,7 +380,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
                 condition, x1, x2, out_dtype, res_shape, out_usm_type, exec_q
             )
         else:
-            out = dpt.empty(
+            out = dpt_ext.empty(
                 res_shape,
                 dtype=out_dtype,
                 order=order,
@@ -389,11 +389,11 @@ def where(condition, x1, x2, /, *, order="K", out=None):
             )
 
     if condition_shape != res_shape:
-        condition = dpt.broadcast_to(condition, res_shape)
+        condition = dpt_ext.broadcast_to(condition, res_shape)
     if x1_shape != res_shape:
-        x1 = dpt.broadcast_to(x1, res_shape)
+        x1 = dpt_ext.broadcast_to(x1, res_shape)
     if x2_shape != res_shape:
-        x2 = dpt.broadcast_to(x2, res_shape)
+        x2 = dpt_ext.broadcast_to(x2, res_shape)
 
     dep_evs = _manager.submitted_events
     hev, where_ev = ti._where(
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
index f48dfa4d4077..67f2502067ca 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
@@ -36,6 +36,7 @@
 
 #include <array>
 #include <cstddef>
+#include <type_traits>
 #include <vector>
 
 #include <sycl/sycl.hpp>
@@ -54,6 +55,10 @@ using dpctl::tensor::ssize_t;
   @defgroup CtorKernels
  */
 
+template <typename Ty>
+class linear_sequence_step_kernel;
+template <typename Ty, typename wTy>
+class linear_sequence_affine_kernel;
 template <typename Ty>
 class full_strided_kernel;
 template <typename Ty>
@@ -61,6 +66,179 @@ class eye_kernel;
 
 using namespace dpctl::tensor::offset_utils;
 
+template <typename Ty>
+class LinearSequenceStepFunctor
+{
+private:
+    Ty *p = nullptr;
+    Ty start_v;
+    Ty step_v;
+
+public:
+    LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv)
+        : p(reinterpret_cast<Ty *>(dst_p)), start_v(v0), step_v(dv)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        auto i = wiid.get(0);
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<Ty>::value) {
+            p[i] = Ty{start_v.real() + i * step_v.real(),
+                      start_v.imag() + i * step_v.imag()};
+        }
+        else {
+            p[i] = start_v + i * step_v;
+        }
+    }
+};
+
+/*!
+ * @brief Function to submit kernel to populate given contiguous memory
+ * allocation with linear sequence specified by typed starting value and
+ * increment.
+ *
+ * @param q  Sycl queue to which the kernel is submitted
+ * @param nelems Length of the sequence
+ * @param start_v Typed starting value of the sequence
+ * @param step_v  Typed increment of the sequence
+ * @param array_data Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename Ty>
+sycl::event lin_space_step_impl(sycl::queue &exec_q,
+                                std::size_t nelems,
+                                Ty start_v,
+                                Ty step_v,
+                                char *array_data,
+                                const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(exec_q);
+    sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.parallel_for<linear_sequence_step_kernel<Ty>>(
+            sycl::range<1>{nelems},
+            LinearSequenceStepFunctor<Ty>(array_data, start_v, step_v));
+    });
+
+    return lin_space_step_event;
+}
+
+// Constructor to populate tensor with linear sequence defined by
+// start and data
+
+template <typename Ty, typename wTy>
+class LinearSequenceAffineFunctor
+{
+private:
+    Ty *p = nullptr;
+    Ty start_v;
+    Ty end_v;
+    std::size_t n;
+
+public:
+    LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den)
+        : p(reinterpret_cast<Ty *>(dst_p)), start_v(v0), end_v(v1),
+          n((den == 0) ? 1 : den)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        auto i = wiid.get(0);
+        wTy wc = wTy(i) / n;
+        wTy w = wTy(n - i) / n;
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<Ty>::value) {
+            using reT = typename Ty::value_type;
+            auto _w = static_cast<reT>(w);
+            auto _wc = static_cast<reT>(wc);
+            auto re_comb = sycl::fma(start_v.real(), _w, reT(0));
+            re_comb =
+                sycl::fma(end_v.real(), _wc,
+                          re_comb); // start_v.real() * _w + end_v.real() * _wc;
+            auto im_comb =
+                sycl::fma(start_v.imag(), _w,
+                          reT(0)); // start_v.imag() * _w + end_v.imag() * _wc;
+            im_comb = sycl::fma(end_v.imag(), _wc, im_comb);
+            Ty affine_comb = Ty{re_comb, im_comb};
+            p[i] = affine_comb;
+        }
+        else if constexpr (std::is_floating_point<Ty>::value) {
+            Ty _w = static_cast<Ty>(w);
+            Ty _wc = static_cast<Ty>(wc);
+            auto affine_comb =
+                sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc;
+            affine_comb = sycl::fma(end_v, _wc, affine_comb);
+            p[i] = affine_comb;
+        }
+        else {
+            using dpctl::tensor::type_utils::convert_impl;
+            auto affine_comb = start_v * w + end_v * wc;
+            p[i] = convert_impl<Ty, decltype(affine_comb)>(affine_comb);
+        }
+    }
+};
+
+/*!
+ * @brief Function to submit kernel to populate given contiguous memory
+ * allocation with linear sequence specified by typed starting and end values.
+ *
+ * @param exec_q  Sycl queue to which kernel is submitted for execution.
+ * @param nelems  Length of the sequence.
+ * @param start_v Starting value of the sequence.
+ * @param end_v   End-value of the sequence.
+ * @param include_endpoint  Whether the end-value is included in the sequence.
+ * @param array_data Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename Ty>
+sycl::event lin_space_affine_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  Ty start_v,
+                                  Ty end_v,
+                                  bool include_endpoint,
+                                  char *array_data,
+                                  const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(exec_q);
+
+    const bool device_supports_doubles =
+        exec_q.get_device().has(sycl::aspect::fp64);
+    const std::size_t den = (include_endpoint) ? nelems - 1 : nelems;
+
+    sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        if (device_supports_doubles) {
+            using KernelName = linear_sequence_affine_kernel<Ty, double>;
+            using Impl = LinearSequenceAffineFunctor<Ty, double>;
+
+            cgh.parallel_for<KernelName>(sycl::range<1>{nelems},
+                                         Impl(array_data, start_v, end_v, den));
+        }
+        else {
+            using KernelName = linear_sequence_affine_kernel<Ty, float>;
+            using Impl = LinearSequenceAffineFunctor<Ty, float>;
+
+            cgh.parallel_for<KernelName>(sycl::range<1>{nelems},
+                                         Impl(array_data, start_v, end_v, den));
+        }
+    });
+
+    return lin_space_affine_event;
+}
+
 /* ================ Full ================== */
 
 /*!
diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp
new file mode 100644
index 000000000000..9a7bf2dbcc0f
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp
@@ -0,0 +1,306 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===---------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/complex.h> // py::cast<std::complex<T>>
+#include <pybind11/pybind11.h>
+
+#include "kernels/constructors.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "linear_sequences.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+// Constructor to populate tensor with linear sequence defined by
+// start and step data
+
+typedef sycl::event (*lin_space_step_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t, // num_elements
+    const py::object &start,
+    const py::object &step,
+    char *, // dst_data_ptr
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to submit kernel to populate given contiguous memory
+ * allocation with linear sequence specified by starting value and increment
+ * given as Python objects.
+ *
+ * @param q  Sycl queue to which the kernel is submitted
+ * @param nelems Length of the sequence
+ * @param start Starting value of the sequence as Python object. Must be
+ * convertible to array element data type `Ty`.
+ * @param step  Increment of the sequence as Python object. Must be convertible
+ * to array element data type `Ty`.
+ * @param array_data Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename Ty>
+sycl::event lin_space_step_impl(sycl::queue &exec_q,
+                                std::size_t nelems,
+                                const py::object &start,
+                                const py::object &step,
+                                char *array_data,
+                                const std::vector<sycl::event> &depends)
+{
+    Ty start_v = py::cast<Ty>(start);
+    Ty step_v = py::cast<Ty>(step);
+
+    using dpctl::tensor::kernels::constructors::lin_space_step_impl;
+
+    auto lin_space_step_event = lin_space_step_impl<Ty>(
+        exec_q, nelems, start_v, step_v, array_data, depends);
+
+    return lin_space_step_event;
+}
+
+typedef sycl::event (*lin_space_affine_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t, // num_elements
+    const py::object &start,
+    const py::object &end,
+    bool include_endpoint,
+    char *, // dst_data_ptr
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to submit kernel to populate given contiguous memory
+ * allocation with linear sequence specified  by starting and end values given
+ * as Python objects.
+ *
+ * @param exec_q  Sycl queue to which kernel is submitted for execution.
+ * @param nelems  Length of the sequence
+ * @param start Stating value of the sequence as Python object. Must be
+ * convertible to array data element type `Ty`.
+ * @param end   End-value of the sequence as Python object. Must be convertible
+ * to array data element type `Ty`.
+ * @param include_endpoint  Whether the end-value is included in the sequence
+ * @param array_data Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename Ty>
+sycl::event lin_space_affine_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  const py::object &start,
+                                  const py::object &end,
+                                  bool include_endpoint,
+                                  char *array_data,
+                                  const std::vector<sycl::event> &depends)
+{
+    Ty start_v = py::cast<Ty>(start);
+    Ty end_v = py::cast<Ty>(end);
+
+    using dpctl::tensor::kernels::constructors::lin_space_affine_impl;
+
+    auto lin_space_affine_event = lin_space_affine_impl<Ty>(
+        exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends);
+
+    return lin_space_affine_event;
+}
+
+using dpctl::utils::keep_args_alive;
+
+static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types];
+
+static lin_space_affine_fn_ptr_t
+    lin_space_affine_dispatch_vector[td_ns::num_types];
+
+std::pair<sycl::event, sycl::event>
+    usm_ndarray_linear_sequence_step(const py::object &start,
+                                     const py::object &dt,
+                                     const dpctl::tensor::usm_ndarray &dst,
+                                     sycl::queue &exec_q,
+                                     const std::vector<sycl::event> &depends)
+{
+    // dst must be 1D and C-contiguous
+    // start, end should be coercible into data type of dst
+
+    if (dst.get_ndim() != 1) {
+        throw py::value_error(
+            "usm_ndarray_linspace: Expecting 1D array to populate");
+    }
+
+    if (!dst.is_c_contiguous()) {
+        throw py::value_error(
+            "usm_ndarray_linspace: Non-contiguous arrays are not supported");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with the allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int dst_typenum = dst.get_typenum();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    py::ssize_t len = dst.get_shape(0);
+    if (len == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    char *dst_data = dst.get_data();
+    sycl::event linspace_step_event;
+
+    auto fn = lin_space_step_dispatch_vector[dst_typeid];
+
+    linspace_step_event =
+        fn(exec_q, static_cast<std::size_t>(len), start, dt, dst_data, depends);
+
+    return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}),
+                          linspace_step_event);
+}
+
+std::pair<sycl::event, sycl::event>
+    usm_ndarray_linear_sequence_affine(const py::object &start,
+                                       const py::object &end,
+                                       const dpctl::tensor::usm_ndarray &dst,
+                                       bool include_endpoint,
+                                       sycl::queue &exec_q,
+                                       const std::vector<sycl::event> &depends)
+{
+    // dst must be 1D and C-contiguous
+    // start, end should be coercible into data type of dst
+
+    if (dst.get_ndim() != 1) {
+        throw py::value_error(
+            "usm_ndarray_linspace: Expecting 1D array to populate");
+    }
+
+    if (!dst.is_c_contiguous()) {
+        throw py::value_error(
+            "usm_ndarray_linspace: Non-contiguous arrays are not supported");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error(
+            "Execution queue context is not the same as allocation context");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int dst_typenum = dst.get_typenum();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    py::ssize_t len = dst.get_shape(0);
+    if (len == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    char *dst_data = dst.get_data();
+    sycl::event linspace_affine_event;
+
+    auto fn = lin_space_affine_dispatch_vector[dst_typeid];
+
+    linspace_affine_event = fn(exec_q, static_cast<std::size_t>(len), start,
+                               end, include_endpoint, dst_data, depends);
+
+    return std::make_pair(
+        keep_args_alive(exec_q, {dst}, {linspace_affine_event}),
+        linspace_affine_event);
+}
+
+/*!
+ * @brief  Factor to get function pointer of type `fnT` for array with elements
+ * of type `Ty`.
+ * @defgroup CtorKernels
+ */
+template <typename fnT, typename Ty>
+struct LinSpaceStepFactory
+{
+    fnT get()
+    {
+        fnT f = lin_space_step_impl<Ty>;
+        return f;
+    }
+};
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for array data type
+ * `Ty`.
+ */
+template <typename fnT, typename Ty>
+struct LinSpaceAffineFactory
+{
+    fnT get()
+    {
+        fnT f = lin_space_affine_impl<Ty>;
+        return f;
+    }
+};
+
+void init_linear_sequences_dispatch_vectors(void)
+{
+    using namespace td_ns;
+
+    DispatchVectorBuilder<lin_space_step_fn_ptr_t, LinSpaceStepFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector);
+
+    DispatchVectorBuilder<lin_space_affine_fn_ptr_t, LinSpaceAffineFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp
new file mode 100644
index 000000000000..45cf45153462
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp
@@ -0,0 +1,66 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event> usm_ndarray_linear_sequence_step(
+    const py::object &start,
+    const py::object &dt,
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends = {});
+
+extern std::pair<sycl::event, sycl::event> usm_ndarray_linear_sequence_affine(
+    const py::object &start,
+    const py::object &end,
+    const dpctl::tensor::usm_ndarray &dst,
+    bool include_endpoint,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends = {});
+
+extern void init_linear_sequences_dispatch_vectors(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
index 5e5b07c087f8..cdd6e43ed9c5 100644
--- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
+++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
@@ -56,7 +56,7 @@
 #include "full_ctor.hpp"
 #include "integer_advanced_indexing.hpp"
 #include "kernels/dpctl_tensor_types.hpp"
-// #include "linear_sequences.hpp"
+#include "linear_sequences.hpp"
 #include "repeat.hpp"
 #include "simplify_iteration_space.hpp"
 #include "triul_ctor.hpp"
@@ -94,8 +94,8 @@ using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray;
 
 /* ============= linear-sequence ==================== */
 
-// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine;
-// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step;
+using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine;
+using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step;
 
 /* ================ Full ================== */
 
@@ -154,7 +154,7 @@ void init_dispatch_vectors(void)
     init_copy_as_contig_dispatch_vectors();
     init_copy_for_reshape_dispatch_vectors();
     init_copy_for_roll_dispatch_vectors();
-    // init_linear_sequences_dispatch_vectors();
+    init_linear_sequences_dispatch_vectors();
     init_full_ctor_dispatch_vectors();
     init_zeros_ctor_dispatch_vectors();
     init_eye_ctor_dispatch_vectors();
@@ -297,20 +297,22 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("src"), py::arg("dst"), py::arg("shifts"),
           py::arg("sycl_queue"), py::arg("depends") = py::list());
 
-    //     m.def("_linspace_step", &usm_ndarray_linear_sequence_step,
-    //           "Fills input 1D contiguous usm_ndarray `dst` with linear
-    //           sequence " "specified by " "starting point `start` and step
-    //           `dt`. " "Returns a tuple of events: (ht_event, comp_event)",
-    //           py::arg("start"), py::arg("dt"), py::arg("dst"),
-    //           py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    //     m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine,
-    //           "Fills input 1D contiguous usm_ndarray `dst` with linear
-    //           sequence " "specified by " "starting point `start` and end
-    //           point `end`. " "Returns a tuple of events: (ht_event,
-    //           comp_event)", py::arg("start"), py::arg("end"), py::arg("dst"),
-    //           py::arg("include_endpoint"), py::arg("sycl_queue"),
-    //           py::arg("depends") = py::list());
+    m.def("_linspace_step", &usm_ndarray_linear_sequence_step,
+          "Fills input 1D contiguous usm_ndarray `dst` with linear sequence "
+          "specified by "
+          "starting point `start` and step `dt`. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("start"), py::arg("dt"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine,
+          "Fills input 1D contiguous usm_ndarray `dst` with linear sequence "
+          "specified by "
+          "starting point `start` and end point `end`. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("start"), py::arg("end"), py::arg("dst"),
+          py::arg("include_endpoint"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 
     m.def("_copy_numpy_ndarray_into_usm_ndarray",
           &copy_numpy_ndarray_into_usm_ndarray,
diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py
index f3dd18153563..4e2ee8531a18 100644
--- a/dpnp/dpnp_algo/dpnp_arraycreation.py
+++ b/dpnp/dpnp_algo/dpnp_arraycreation.py
@@ -53,7 +53,7 @@ def _as_usm_ndarray(a, usm_type, sycl_queue):
 
     if isinstance(a, dpnp_array):
         a = a.get_array()
-    return dpt.asarray(a, usm_type=usm_type, sycl_queue=sycl_queue)
+    return dpt_ext.asarray(a, usm_type=usm_type, sycl_queue=sycl_queue)
 
 
 def _check_has_zero_val(a):
@@ -196,7 +196,7 @@ def dpnp_linspace(
 
     if dpnp.isscalar(start) and dpnp.isscalar(stop):
         # Call linspace() function for scalars.
-        usm_res = dpt.linspace(
+        usm_res = dpt_ext.linspace(
             start,
             stop,
             num,
@@ -213,19 +213,19 @@ def dpnp_linspace(
             else:
                 step = dpnp.nan
     else:
-        usm_start = dpt.asarray(
+        usm_start = dpt_ext.asarray(
             start,
             dtype=dt,
             usm_type=_usm_type,
             sycl_queue=sycl_queue_normalized,
         )
-        usm_stop = dpt.asarray(
+        usm_stop = dpt_ext.asarray(
             stop, dtype=dt, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
         )
 
         delta = usm_stop - usm_start
 
-        usm_res = dpt.arange(
+        usm_res = dpt_ext.arange(
             0,
             stop=num,
             step=1,
@@ -256,7 +256,7 @@ def dpnp_linspace(
             usm_res[-1, ...] = usm_stop
 
     if axis != 0:
-        usm_res = dpt.moveaxis(usm_res, 0, axis)
+        usm_res = dpt_ext.moveaxis(usm_res, 0, axis)
 
     if dpnp.issubdtype(dtype, dpnp.integer):
         dpt.floor(usm_res, out=usm_res)
@@ -266,7 +266,7 @@ def dpnp_linspace(
 
     if retstep is True:
         if dpnp.isscalar(step):
-            step = dpt.asarray(
+            step = dpt_ext.asarray(
                 step, usm_type=res.usm_type, sycl_queue=res.sycl_queue
             )
         return res, dpnp_array._create_from_usm_ndarray(step)
diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
index 722b8cb3b3f0..d7eeccf78489 100644
--- a/dpnp/dpnp_algo/dpnp_elementwise_common.py
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -467,7 +467,7 @@ def __call__(
                     )
 
                 # Allocate a temporary buffer with the required dtype
-                out[i] = dpt.empty_like(res, dtype=res_dt)
+                out[i] = dpt_ext.empty_like(res, dtype=res_dt)
             elif (
                 buf_dt is None
                 and dti._array_overlap(x, res)
@@ -476,7 +476,7 @@ def __call__(
                 # Allocate a temporary buffer to avoid memory overlapping.
                 # Note if `buf_dt` is not None, a temporary copy of `x` will be
                 # created, so the array overlap check isn't needed.
-                out[i] = dpt.empty_like(res)
+                out[i] = dpt_ext.empty_like(res)
 
         _manager = dpu.SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
@@ -486,7 +486,7 @@ def __call__(
             if order == "K":
                 buf = dtc._empty_like_orderK(x, buf_dt)
             else:
-                buf = dpt.empty_like(x, dtype=buf_dt, order=order)
+                buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order)
 
             ht_copy_ev, copy_ev = dti._copy_usm_ndarray_into_usm_ndarray(
                 src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs
@@ -503,7 +503,7 @@ def __call__(
                 if order == "K":
                     out[i] = dtc._empty_like_orderK(x, res_dt)
                 else:
-                    out[i] = dpt.empty_like(x, dtype=res_dt, order=order)
+                    out[i] = dpt_ext.empty_like(x, dtype=res_dt, order=order)
 
         # Call the unary function with input and output arrays
         ht_unary_ev, unary_ev = self.get_implementation_function()(
@@ -713,7 +713,7 @@ def __call__(
 
         if dtype is not None:
             if dpnp.isscalar(x1):
-                x1_usm = dpt.asarray(
+                x1_usm = dpt_ext.asarray(
                     x1,
                     dtype=dtype,
                     sycl_queue=x2.sycl_queue,
@@ -722,7 +722,7 @@ def __call__(
                 x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False)
             elif dpnp.isscalar(x2):
                 x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False)
-                x2_usm = dpt.asarray(
+                x2_usm = dpt_ext.asarray(
                     x2,
                     dtype=dtype,
                     sycl_queue=x1.sycl_queue,
@@ -1078,7 +1078,7 @@ def __call__(
                     )
 
                 # Allocate a temporary buffer with the required dtype
-                out[i] = dpt.empty_like(res, dtype=res_dt)
+                out[i] = dpt_ext.empty_like(res, dtype=res_dt)
             else:
                 # If `dt` is not None, a temporary copy of `x` will be created,
                 # so the array overlap check isn't needed.
@@ -1094,7 +1094,7 @@ def __call__(
                     for x in x_to_check
                 ):
                     # allocate a temporary buffer to avoid memory overlapping
-                    out[i] = dpt.empty_like(res)
+                    out[i] = dpt_ext.empty_like(res)
 
         x1 = dpnp.as_usm_ndarray(x1, dtype=x1_dt, sycl_queue=exec_q)
         x2 = dpnp.as_usm_ndarray(x2, dtype=x2_dt, sycl_queue=exec_q)
@@ -1127,7 +1127,7 @@ def __call__(
                 if order == "K":
                     buf = dtc._empty_like_orderK(x, buf_dt)
                 else:
-                    buf = dpt.empty_like(x, dtype=buf_dt, order=order)
+                    buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order)
 
                 ht_copy_ev, copy_ev = dti._copy_usm_ndarray_into_usm_ndarray(
                     src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs
@@ -1146,7 +1146,7 @@ def __call__(
                         x1, x2, res_dt, res_shape, res_usm_type, exec_q
                     )
                 else:
-                    out[i] = dpt.empty(
+                    out[i] = dpt_ext.empty(
                         res_shape,
                         dtype=res_dt,
                         order=order,
@@ -1156,9 +1156,9 @@ def __call__(
 
         # Broadcast shapes of input arrays
         if x1.shape != res_shape:
-            x1 = dpt.broadcast_to(x1, res_shape)
+            x1 = dpt_ext.broadcast_to(x1, res_shape)
         if x2.shape != res_shape:
-            x2 = dpt.broadcast_to(x2, res_shape)
+            x2 = dpt_ext.broadcast_to(x2, res_shape)
 
         # Call the binary function with input and output arrays
         ht_binary_ev, binary_ev = self.get_implementation_function()(
diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py
index ddba9f634cb1..c9ae58a114a9 100644
--- a/dpnp/dpnp_algo/dpnp_fill.py
+++ b/dpnp/dpnp_algo/dpnp_fill.py
@@ -28,14 +28,13 @@
 
 from numbers import Number
 
-import dpctl.tensor as dpt
 import dpctl.utils as dpu
-from dpctl.tensor._ctors import _cast_fill_val
 
 # TODO: revert to `from dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpnp
+from dpctl_ext.tensor._ctors import _cast_fill_val
 from dpctl_ext.tensor._tensor_impl import (
     _copy_usm_ndarray_into_usm_ndarray,
     _full_usm_ndarray,
@@ -56,7 +55,7 @@ def dpnp_fill(arr, val):
             raise dpu.ExecutionPlacementError(
                 "Input arrays have incompatible queues."
             )
-        a_val = dpt_ext.astype(val, arr.dtype)
+        a_val = dpt.astype(val, arr.dtype)
         a_val = dpt.broadcast_to(a_val, arr.shape)
         _manager = dpu.SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py
index b3ed3770396d..6418302d6e7b 100644
--- a/dpnp/dpnp_array.py
+++ b/dpnp/dpnp_array.py
@@ -2283,7 +2283,7 @@ def transpose(self, *axes):
                 # self.transpose(None).shape == self.shape[::-1]
                 axes = tuple((ndim - x - 1) for x in range(ndim))
 
-            usm_res = dpt.permute_dims(self._array_obj, axes)
+            usm_res = dpt_ext.permute_dims(self._array_obj, axes)
         return dpnp_array._create_from_usm_ndarray(usm_res)
 
     def var(
diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py
index 0727b9bfd775..9fe955746593 100644
--- a/dpnp/dpnp_container.py
+++ b/dpnp/dpnp_container.py
@@ -35,12 +35,11 @@
 
 """
 
-import dpctl.tensor as dpt
 import dpctl.utils as dpu
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpnp
 from dpnp.dpnp_array import dpnp_array
 
@@ -143,7 +142,7 @@ def copy(x1, /, *, order="K"):
     if order is None:
         order = "K"
 
-    array_obj = dpt_ext.copy(dpnp.get_usm_ndarray(x1), order=order)
+    array_obj = dpt.copy(dpnp.get_usm_ndarray(x1), order=order)
     return dpnp_array._create_from_usm_ndarray(array_obj)
 
 
@@ -196,7 +195,7 @@ def eye(
         order = "C"
 
     """Creates `dpnp_array` with ones on the `k`th diagonal."""
-    array_obj = dpt_ext.eye(
+    array_obj = dpt.eye(
         N,
         M,
         k=k,
@@ -231,7 +230,7 @@ def full(
         fill_value = fill_value.get_array()
 
     """Creates `dpnp_array` having a specified shape, filled with fill_value."""
-    array_obj = dpt_ext.full(
+    array_obj = dpt.full(
         shape,
         fill_value,
         dtype=dtype,
@@ -272,13 +271,13 @@ def ones(
 
 def tril(x1, /, *, k=0):
     """Creates `dpnp_array` as lower triangular part of an input array."""
-    array_obj = dpt_ext.tril(dpnp.get_usm_ndarray(x1), k=k)
+    array_obj = dpt.tril(dpnp.get_usm_ndarray(x1), k=k)
     return dpnp_array._create_from_usm_ndarray(array_obj)
 
 
 def triu(x1, /, *, k=0):
     """Creates `dpnp_array` as upper triangular part of an input array."""
-    array_obj = dpt_ext.triu(dpnp.get_usm_ndarray(x1), k=k)
+    array_obj = dpt.triu(dpnp.get_usm_ndarray(x1), k=k)
     return dpnp_array._create_from_usm_ndarray(array_obj)
 
 
diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index 6c050a208981..9fca083a6413 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -191,7 +191,7 @@ def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None):
     if is_supported_array_type(a):
         return get_usm_ndarray(a)
 
-    return dpt.asarray(
+    return dpt_ext.asarray(
         a, dtype=dtype, device=device, usm_type=usm_type, sycl_queue=sycl_queue
     )
 
diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py
index 52fc4b7f6448..d09cc17bde79 100644
--- a/dpnp/dpnp_iface_arraycreation.py
+++ b/dpnp/dpnp_iface_arraycreation.py
@@ -3131,7 +3131,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"):
         output[1] = dpt_ext.reshape(output[1], (-1, 1) + s0[2:])
 
     if not sparse:
-        output = dpt.broadcast_arrays(*output)
+        output = dpt_ext.broadcast_arrays(*output)
 
     if copy:
         output = [dpt_ext.copy(x) for x in output]
@@ -3696,7 +3696,7 @@ def tri(
     if usm_type is None:
         usm_type = "device"
 
-    m = dpt.ones(
+    m = dpt_ext.ones(
         (N, M),
         dtype=_dtype,
         device=device,
@@ -3912,7 +3912,7 @@ def vander(
 
     if dpnp.is_supported_array_type(x):
         x = dpnp.get_usm_ndarray(x)
-    usm_x = dpt.asarray(
+    usm_x = dpt_ext.asarray(
         x, device=device, usm_type=usm_type, sycl_queue=sycl_queue
     )
 
@@ -3935,7 +3935,7 @@ def vander(
     tmp = m[:, ::-1] if not increasing else m
     dpnp.power(
         dpt_ext.reshape(usm_x, (-1, 1)),
-        dpt.arange(
+        dpt_ext.arange(
             N, dtype=_dtype, usm_type=x_usm_type, sycl_queue=x_sycl_queue
         ),
         out=tmp,
diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index bc190db70c4e..a52196e9e4db 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -141,9 +141,9 @@ def _choose_run(inds, chcs, q, usm_type, out=None, mode=0):
             ti._array_overlap(out, chc) for chc in chcs
         ):
             # Allocate a temporary buffer to avoid memory overlapping.
-            out = dpt.empty_like(out)
+            out = dpt_ext.empty_like(out)
     else:
-        out = dpt.empty(
+        out = dpt_ext.empty(
             inds.shape, dtype=chcs[0].dtype, usm_type=usm_type, sycl_queue=q
         )
 
@@ -260,7 +260,7 @@ def choose(a, choices, out=None, mode="wrap"):
                 choices,
             )
         )
-    arrs_broadcast = dpt.broadcast_arrays(inds, *choices)
+    arrs_broadcast = dpt_ext.broadcast_arrays(inds, *choices)
     inds = arrs_broadcast[0]
     choices = tuple(arrs_broadcast[1:])
 
@@ -301,9 +301,11 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0):
 
         if ti._array_overlap(x, out):
             # Allocate a temporary buffer to avoid memory overlapping.
-            out = dpt.empty_like(out)
+            out = dpt_ext.empty_like(out)
     else:
-        out = dpt.empty(res_sh, dtype=x.dtype, usm_type=usm_type, sycl_queue=q)
+        out = dpt_ext.empty(
+            res_sh, dtype=x.dtype, usm_type=usm_type, sycl_queue=q
+        )
 
     _manager = dpu.SequentialOrderManager[q]
     dep_evs = _manager.submitted_events
@@ -1803,7 +1805,7 @@ def put_along_axis(a, ind, values, axis, mode="wrap"):
     if dpnp.is_supported_array_type(values):
         usm_vals = dpnp.get_usm_ndarray(values)
     else:
-        usm_vals = dpt.asarray(
+        usm_vals = dpt_ext.asarray(
             values, usm_type=a.usm_type, sycl_queue=a.sycl_queue
         )
 
@@ -2151,7 +2153,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"):
 
     usm_a = dpnp.get_usm_ndarray(a)
     if not dpnp.is_supported_array_type(indices):
-        usm_ind = dpt.asarray(
+        usm_ind = dpt_ext.asarray(
             indices, usm_type=a.usm_type, sycl_queue=a.sycl_queue
         )
     else:
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index d5e1e1aa5706..d188ae098cd9 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -375,7 +375,7 @@ def _get_first_nan_index(usm_a):
         ):
             if dpnp.issubdtype(usm_a.dtype, dpnp.complexfloating):
                 # for complex all NaNs are considered equivalent
-                true_val = dpt.asarray(
+                true_val = dpt_ext.asarray(
                     True, sycl_queue=usm_a.sycl_queue, usm_type=usm_a.usm_type
                 )
                 return dpt.searchsorted(dpt.isnan(usm_a), true_val, side="left")
@@ -1093,7 +1093,9 @@ def broadcast_arrays(*args, subok=False):
     if len(args) == 0:
         return []
 
-    usm_arrays = dpt.broadcast_arrays(*[dpnp.get_usm_ndarray(a) for a in args])
+    usm_arrays = dpt_ext.broadcast_arrays(
+        *[dpnp.get_usm_ndarray(a) for a in args]
+    )
     return [dpnp_array._create_from_usm_ndarray(a) for a in usm_arrays]
 
 
@@ -1178,7 +1180,7 @@ def broadcast_to(array, /, shape, subok=False):
         raise NotImplementedError(f"subok={subok} is currently not supported")
 
     usm_array = dpnp.get_usm_ndarray(array)
-    new_array = dpt.broadcast_to(usm_array, shape)
+    new_array = dpt_ext.broadcast_to(usm_array, shape)
     return dpnp_array._create_from_usm_ndarray(new_array)
 
 
@@ -1416,7 +1418,7 @@ def concatenate(
         )
 
     usm_arrays = [dpnp.get_usm_ndarray(x) for x in arrays]
-    usm_res = dpt.concat(usm_arrays, axis=axis)
+    usm_res = dpt_ext.concat(usm_arrays, axis=axis)
 
     res = dpnp_array._create_from_usm_ndarray(usm_res)
     if dtype is not None:
@@ -1521,7 +1523,7 @@ def copyto(dst, src, casting="same_kind", where=True):
                 f"but got {where.dtype}"
             )
 
-        dst_usm, src_usm, mask_usm = dpt.broadcast_arrays(
+        dst_usm, src_usm, mask_usm = dpt_ext.broadcast_arrays(
             dpnp.get_usm_ndarray(dst),
             dpnp.get_usm_ndarray(src),
             dpnp.get_usm_ndarray(where),
@@ -1849,7 +1851,7 @@ def expand_dims(a, axis):
     """
 
     usm_a = dpnp.get_usm_ndarray(a)
-    usm_res = dpt.expand_dims(usm_a, axis=axis)
+    usm_res = dpt_ext.expand_dims(usm_a, axis=axis)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -1920,7 +1922,7 @@ def flip(m, axis=None):
     """
 
     m_usm = dpnp.get_usm_ndarray(m)
-    return dpnp_array._create_from_usm_ndarray(dpt.flip(m_usm, axis=axis))
+    return dpnp_array._create_from_usm_ndarray(dpt_ext.flip(m_usm, axis=axis))
 
 
 def fliplr(m):
@@ -2408,7 +2410,7 @@ def moveaxis(a, source, destination):
 
     usm_array = dpnp.get_usm_ndarray(a)
     return dpnp_array._create_from_usm_ndarray(
-        dpt.moveaxis(usm_array, source, destination)
+        dpt_ext.moveaxis(usm_array, source, destination)
     )
 
 
@@ -3663,7 +3665,7 @@ def squeeze(a, /, axis=None):
     """
 
     usm_a = dpnp.get_usm_ndarray(a)
-    usm_res = dpt.squeeze(usm_a, axis=axis)
+    usm_res = dpt_ext.squeeze(usm_a, axis=axis)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -3751,7 +3753,7 @@ def stack(arrays, /, *, axis=0, out=None, dtype=None, casting="same_kind"):
         )
 
     usm_arrays = [dpnp.get_usm_ndarray(x) for x in arrays]
-    usm_res = dpt.stack(usm_arrays, axis=axis)
+    usm_res = dpt_ext.stack(usm_arrays, axis=axis)
 
     res = dpnp_array._create_from_usm_ndarray(usm_res)
     if dtype is not None:
@@ -3812,7 +3814,7 @@ def swapaxes(a, axis1, axis2):
     """
 
     usm_a = dpnp.get_usm_ndarray(a)
-    usm_res = dpt.swapaxes(usm_a, axis1=axis1, axis2=axis2)
+    usm_res = dpt_ext.swapaxes(usm_a, axis1=axis1, axis2=axis2)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -3892,7 +3894,7 @@ def tile(A, reps):
     """
 
     usm_a = dpnp.get_usm_ndarray(A)
-    usm_res = dpt.tile(usm_a, reps)
+    usm_res = dpt_ext.tile(usm_a, reps)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -4522,7 +4524,7 @@ def unstack(x, /, *, axis=0):
     if usm_x.ndim == 0:
         raise ValueError("Input array must be at least 1-d.")
 
-    res = dpt.unstack(usm_x, axis=axis)
+    res = dpt_ext.unstack(usm_x, axis=axis)
     return tuple(dpnp_array._create_from_usm_ndarray(a) for a in res)
 
 
diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py
index a2389978d506..15f52338ec7e 100644
--- a/dpnp/dpnp_iface_searching.py
+++ b/dpnp/dpnp_iface_searching.py
@@ -376,7 +376,7 @@ def searchsorted(a, v, side="left", sorter=None):
 
     usm_a = dpnp.get_usm_ndarray(a)
     if dpnp.isscalar(v):
-        usm_v = dpt.asarray(v, sycl_queue=a.sycl_queue, usm_type=a.usm_type)
+        usm_v = dpt_ext.asarray(v, sycl_queue=a.sycl_queue, usm_type=a.usm_type)
     else:
         usm_v = dpnp.get_usm_ndarray(v)
 
diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py
index b959b78e1ad0..20d0dcd0cff2 100644
--- a/dpnp/fft/dpnp_utils_fft.py
+++ b/dpnp/fft/dpnp_utils_fft.py
@@ -42,19 +42,13 @@
 from collections.abc import Sequence
 
 import dpctl
-
-# pylint: disable=no-name-in-module
-# TODO: remove it when ti.__linspace_step
-# is migrated to dpctl_ext/tensor
-import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dpu
 import numpy
 from dpctl.utils import ExecutionPlacementError
 
-# pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor._tensor_impl as ti_ext
+import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 import dpnp.backend.extensions.fft._fft_impl as fi
 from dpctl_ext.tensor._numpy_helper import (
@@ -205,7 +199,7 @@ def _compute_result(dsc, a, out, forward, c2c, out_strides):
         if (
             out is not None
             and out.strides == tuple(out_strides)
-            and not ti_ext._array_overlap(a_usm, dpnp.get_usm_ndarray(out))
+            and not ti._array_overlap(a_usm, dpnp.get_usm_ndarray(out))
         ):
             res_usm = out_usm
             result = out
@@ -538,7 +532,7 @@ def _truncate_or_pad(a, shape, axes):
             )
             _manager = dpu.SequentialOrderManager[exec_q]
             dep_evs = _manager.submitted_events
-            ht_copy_ev, copy_ev = ti_ext._copy_usm_ndarray_into_usm_ndarray(
+            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
                 src=dpnp.get_usm_ndarray(a),
                 dst=z.get_array()[tuple(index)],
                 sycl_queue=exec_q,
diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py
index 88e6aacb997d..8d89f2a42ca8 100644
--- a/dpnp/tests/test_arraycreation.py
+++ b/dpnp/tests/test_arraycreation.py
@@ -2,7 +2,6 @@
 from math import prod
 
 import dpctl
-import dpctl.tensor as dpt
 import numpy
 import pytest
 from numpy.testing import (
@@ -15,7 +14,7 @@
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpnp
 
 from .helper import (
@@ -972,7 +971,7 @@ def test_ones_like(array, dtype, order):
     ],
 )
 def test_dpctl_tensor_input(func, args):
-    x0 = dpt_ext.reshape(dpt.arange(9), (3, 3))
+    x0 = dpt.reshape(dpt.arange(9), (3, 3))
     new_args = [eval(val, {"x0": x0}) for val in args]
     X = getattr(dpt, func)(*new_args)
     Y = getattr(dpnp, func)(*new_args)
diff --git a/dpnp/tests/test_arraymanipulation.py b/dpnp/tests/test_arraymanipulation.py
index 17d2c07fd6cc..f7df6387caf6 100644
--- a/dpnp/tests/test_arraymanipulation.py
+++ b/dpnp/tests/test_arraymanipulation.py
@@ -1,10 +1,10 @@
-import warnings
-
-import dpctl.tensor as dpt
 import numpy
 import pytest
 from numpy.testing import assert_array_equal, assert_equal, assert_raises
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 import dpnp
 
 # TODO: revert to `from dpctl.tensor...`
diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py
index 226420057748..3a19a2cf3668 100644
--- a/dpnp/tests/test_fft.py
+++ b/dpnp/tests/test_fft.py
@@ -1,10 +1,12 @@
 import dpctl
-import dpctl.tensor as dpt
 import numpy
 import pytest
 from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_raises
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 import dpnp
 from dpnp.dpnp_utils import map_dtype_to_device
 
diff --git a/dpnp/tests/test_indexing.py b/dpnp/tests/test_indexing.py
index 79c41a2f45f7..d8822d77080b 100644
--- a/dpnp/tests/test_indexing.py
+++ b/dpnp/tests/test_indexing.py
@@ -1,7 +1,6 @@
 import functools
 
 import dpctl
-import dpctl.tensor as dpt
 import numpy
 import pytest
 from dpctl.utils import ExecutionPlacementError
@@ -13,10 +12,10 @@
     assert_raises_regex,
 )
 
-import dpnp
-
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
+import dpnp
 from dpctl_ext.tensor._numpy_helper import AxisError
 from dpctl_ext.tensor._type_utils import _to_device_supported_dtype
 from dpnp.dpnp_array import dpnp_array
diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index e1ad9af7d220..dfd6e21c2a95 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -1,7 +1,6 @@
 import warnings
 
 import dpctl
-import dpctl.tensor as dpt
 import numpy
 import pytest
 from dpctl.utils import ExecutionPlacementError
@@ -13,6 +12,9 @@
     assert_raises_regex,
 )
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 import dpnp
 
 # TODO: revert to `from dpctl.tensor...`
diff --git a/dpnp/tests/test_manipulation.py b/dpnp/tests/test_manipulation.py
index 2512c0955da7..d30c08a65f1e 100644
--- a/dpnp/tests/test_manipulation.py
+++ b/dpnp/tests/test_manipulation.py
@@ -1,6 +1,5 @@
 import itertools
 
-import dpctl.tensor as dpt
 import numpy
 import pytest
 from numpy.testing import (
@@ -9,6 +8,9 @@
     assert_raises,
 )
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 import dpnp
 
 # TODO: revert to `from dpctl.tensor...`
diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py
index ef8f6731ffd2..c03787790280 100644
--- a/dpnp/tests/test_mathematical.py
+++ b/dpnp/tests/test_mathematical.py
@@ -11,6 +11,9 @@
     assert_raises_regex,
 )
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 
 # TODO: revert to `from dpctl.tensor...`
@@ -669,15 +672,15 @@ def test_to_begin_to_end(self, to_begin, to_end):
         "to_begin, to_end",
         [
             (-20, 20),
-            (dpt.asarray([-20, -30]), dpt.asarray([20, 15])),
-            (dpt.asarray([[-20, -30]]), dpt.asarray([[20, 15]])),
+            (dpt_ext.asarray([-20, -30]), dpt_ext.asarray([20, 15])),
+            (dpt_ext.asarray([[-20, -30]]), dpt_ext.asarray([[20, 15]])),
             ([1, 2], [3, 4]),
             ((1, 2), (3, 4)),
         ],
     )
     def test_usm_ndarray(self, to_begin, to_end):
         a = numpy.array([[1, 2, 0]])
-        dpt_a = dpt.asarray(a)
+        dpt_a = dpt_ext.asarray(a)
 
         if isinstance(to_begin, dpt.usm_ndarray):
             np_to_begin = dpt.asnumpy(to_begin)
@@ -1578,7 +1581,7 @@ def test_out(self):
         assert_allclose(result, expected)
 
         # output is usm_ndarray
-        dpt_out = dpt.empty(expected.shape, dtype=expected.dtype)
+        dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype)
         result = dpnp.prod(ia, axis=0, out=dpt_out)
         assert dpt_out is result.get_array()
         assert_allclose(result, expected)
@@ -2631,7 +2634,7 @@ def test_out_float16(self, func):
     def test_out_usm_ndarray(self, func, dt):
         a = generate_random_numpy_array(10, dt)
         out = numpy.empty(a.shape, dtype=dt)
-        ia, usm_out = dpnp.array(a), dpt.asarray(out)
+        ia, usm_out = dpnp.array(a), dpt_ext.asarray(out)
 
         expected = getattr(numpy, func)(a, out=out)
         result = getattr(dpnp, func)(ia, out=usm_out)
diff --git a/dpnp/tests/test_memory.py b/dpnp/tests/test_memory.py
index 1bc0da8c1535..94aeda33f505 100644
--- a/dpnp/tests/test_memory.py
+++ b/dpnp/tests/test_memory.py
@@ -2,6 +2,9 @@
 import numpy
 import pytest
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 import dpnp.memory as dpm
 
@@ -21,7 +24,7 @@ def test_wrong_input_type(self, x):
             dpm.create_data(x)
 
     def test_wrong_usm_data(self):
-        a = dpt.ones(10)
+        a = dpt_ext.ones(10)
         d = IntUsmData(a.shape, buffer=a)
 
         with pytest.raises(TypeError):
diff --git a/dpnp/tests/test_nanfunctions.py b/dpnp/tests/test_nanfunctions.py
index d92cee045a72..2cb70df5954a 100644
--- a/dpnp/tests/test_nanfunctions.py
+++ b/dpnp/tests/test_nanfunctions.py
@@ -1,5 +1,4 @@
 import dpctl
-import dpctl.tensor as dpt
 import numpy
 import pytest
 from dpctl.utils import ExecutionPlacementError
@@ -12,6 +11,9 @@
     assert_raises_regex,
 )
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 import dpnp
 
 from .helper import (
diff --git a/dpnp/tests/test_ndarray.py b/dpnp/tests/test_ndarray.py
index 4e4e42bbc85e..a27f0fe6aa14 100644
--- a/dpnp/tests/test_ndarray.py
+++ b/dpnp/tests/test_ndarray.py
@@ -9,6 +9,9 @@
     assert_raises_regex,
 )
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 
 from .helper import (
@@ -407,7 +410,7 @@ def test_error(self):
 class TestUsmNdarrayProtocol:
     def test_basic(self):
         a = dpnp.arange(256, dtype=dpnp.int64)
-        usm_a = dpt.asarray(a)
+        usm_a = dpt_ext.asarray(a)
 
         assert a.sycl_queue == usm_a.sycl_queue
         assert a.usm_type == usm_a.usm_type
diff --git a/dpnp/tests/test_search.py b/dpnp/tests/test_search.py
index 64c4eb75f906..36e0032ccff1 100644
--- a/dpnp/tests/test_search.py
+++ b/dpnp/tests/test_search.py
@@ -1,8 +1,10 @@
-import dpctl.tensor as dpt
 import numpy
 import pytest
 from numpy.testing import assert_array_equal, assert_equal, assert_raises
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 import dpnp
 
 from .helper import (
diff --git a/dpnp/tests/test_statistics.py b/dpnp/tests/test_statistics.py
index cf436087b607..fe8848b6c858 100644
--- a/dpnp/tests/test_statistics.py
+++ b/dpnp/tests/test_statistics.py
@@ -1,5 +1,4 @@
 import dpctl
-import dpctl.tensor as dpt
 import numpy
 import pytest
 from numpy.testing import (
@@ -9,6 +8,9 @@
     assert_raises_regex,
 )
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 import dpnp
 
 from .helper import (
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index d1853579036a..a9c076a7c476 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -2,12 +2,14 @@
 import tempfile
 
 import dpctl
-import dpctl.tensor as dpt
 import numpy
 import pytest
 from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_array_equal, assert_raises
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 import dpnp
 import dpnp.linalg
 from dpnp.dpnp_array import dpnp_array
diff --git a/dpnp/tests/test_usm_type.py b/dpnp/tests/test_usm_type.py
index 4fc0f2b958fa..8f8efd1cdd10 100644
--- a/dpnp/tests/test_usm_type.py
+++ b/dpnp/tests/test_usm_type.py
@@ -2,11 +2,13 @@
 import tempfile
 from math import prod
 
-import dpctl.tensor as dpt
 import dpctl.utils as du
 import numpy
 import pytest
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 import dpnp
 from dpnp.dpnp_utils import get_usm_allocations
 
diff --git a/dpnp/tests/test_utils.py b/dpnp/tests/test_utils.py
index eef9132e5b55..ddbd267c2108 100644
--- a/dpnp/tests/test_utils.py
+++ b/dpnp/tests/test_utils.py
@@ -1,7 +1,9 @@
-import dpctl.tensor as dpt
 import numpy
 import pytest
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 import dpnp
 
 

From e96405c2b97940eab02c06474b50f87a495b5005 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 9 Mar 2026 13:46:06 +0100
Subject: [PATCH 09/43] Move `_tensor_accumulation_impl `extension and use it
 for dpnp (#2791)

This PR completely moves `_tensor_accumulation_impl` pybind11 extension
into `dpctl_ext.tensor` and extends `dpctl_ext.tensor` Python API with
the functions `cumulative_logsumexp, cumulative_prod and cumulative_sum`
reusing them in dpnp
---
 dpctl_ext/tensor/CMakeLists.txt               |  32 +-
 dpctl_ext/tensor/__init__.py                  |   4 +
 dpctl_ext/tensor/_accumulation.py             | 470 ++++++++++++++++++
 .../accumulators/accumulate_over_axis.hpp     | 462 +++++++++++++++++
 .../accumulators/accumulators_common.cpp      |  55 ++
 .../accumulators/accumulators_common.hpp      |  46 ++
 .../accumulators/cumulative_logsumexp.cpp     | 347 +++++++++++++
 .../accumulators/cumulative_logsumexp.hpp     |  46 ++
 .../source/accumulators/cumulative_prod.cpp   | 356 +++++++++++++
 .../source/accumulators/cumulative_prod.hpp   |  46 ++
 .../source/accumulators/cumulative_sum.cpp    | 354 +++++++++++++
 .../source/accumulators/cumulative_sum.hpp    |  46 ++
 .../libtensor/source/tensor_accumulation.cpp  |  43 ++
 dpnp/dpnp_iface_mathematical.py               |   8 +-
 dpnp/dpnp_iface_trigonometric.py              |   3 +-
 15 files changed, 2305 insertions(+), 13 deletions(-)
 create mode 100644 dpctl_ext/tensor/_accumulation.py
 create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index 864e34ddaba4..eff5e7552648 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -63,6 +63,16 @@ set(_tensor_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
 )
+set(_accumulator_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/accumulators_common.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp
+)
+set(_tensor_accumulation_impl_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp
+    ${_accumulator_sources}
+)
 
 set(_static_lib_trgt simplify_iteration_space)
 
@@ -85,6 +95,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources})
 target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
 list(APPEND _py_trgts ${python_module_name})
 
+set(python_module_name _tensor_accumulation_impl)
+pybind11_add_module(${python_module_name} MODULE ${_tensor_accumulation_impl_sources})
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_impl_sources})
+target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
+list(APPEND _py_trgts ${python_module_name})
+
 set(_clang_prefix "")
 if(WIN32)
     set(_clang_prefix "/clang:")
@@ -97,14 +113,14 @@ set(_no_fast_math_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
 )
-#list(
-#APPEND _no_fast_math_sources
-# ${_elementwise_sources}
-# ${_reduction_sources}
-# ${_sorting_sources}
-# ${_linalg_sources}
-# ${_accumulator_sources}
-#)
+list(
+    APPEND _no_fast_math_sources
+    # ${_elementwise_sources}
+    # ${_reduction_sources}
+    # ${_sorting_sources}
+    # ${_linalg_sources}
+    ${_accumulator_sources}
+)
 
 foreach(_src_fn ${_no_fast_math_sources})
     get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS)
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index 9d4013e146a7..72c7536ed473 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -27,6 +27,7 @@
 # *****************************************************************************
 
 
+from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum
 from ._clip import clip
 from ._copy_utils import (
     asnumpy,
@@ -92,6 +93,9 @@
     "concat",
     "copy",
     "clip",
+    "cumulative_logsumexp",
+    "cumulative_prod",
+    "cumulative_sum",
     "empty",
     "empty_like",
     "extract",
diff --git a/dpctl_ext/tensor/_accumulation.py b/dpctl_ext/tensor/_accumulation.py
new file mode 100644
index 000000000000..2dfe9656e198
--- /dev/null
+++ b/dpctl_ext/tensor/_accumulation.py
@@ -0,0 +1,470 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import dpctl.tensor as dpt
+from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._tensor_accumulation_impl as tai
+import dpctl_ext.tensor._tensor_impl as ti
+
+from ._numpy_helper import normalize_axis_index
+from ._type_utils import (
+    _default_accumulation_dtype,
+    _default_accumulation_dtype_fp_types,
+    _to_device_supported_dtype,
+)
+
+
+def _accumulate_common(
+    x,
+    axis,
+    dtype,
+    include_initial,
+    out,
+    _accumulate_fn,
+    _accumulate_include_initial_fn,
+    _dtype_supported,
+    _default_accumulation_type_fn,
+):
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+    appended_axis = False
+    if x.ndim == 0:
+        x = x[dpt.newaxis]
+        appended_axis = True
+    nd = x.ndim
+    if axis is None:
+        if nd > 1:
+            raise ValueError(
+                "`axis` cannot be `None` for array of dimension `{}`".format(nd)
+            )
+        axis = 0
+    else:
+        axis = normalize_axis_index(axis, nd, "axis")
+    sh = x.shape
+    res_sh = (
+        sh[:axis] + (sh[axis] + 1,) + sh[axis + 1 :] if include_initial else sh
+    )
+    a1 = axis + 1
+    if a1 == nd:
+        perm = list(range(nd))
+        arr = x
+    else:
+        perm = [i for i in range(nd) if i != axis] + [
+            axis,
+        ]
+        arr = dpt_ext.permute_dims(x, perm)
+    q = x.sycl_queue
+    inp_dt = x.dtype
+    res_usm_type = x.usm_type
+    if dtype is None:
+        res_dt = _default_accumulation_type_fn(inp_dt, q)
+    else:
+        res_dt = dpt.dtype(dtype)
+        res_dt = _to_device_supported_dtype(res_dt, q.sycl_device)
+
+    # checking now avoids unnecessary allocations
+    implemented_types = _dtype_supported(inp_dt, res_dt)
+    if dtype is None and not implemented_types:
+        raise RuntimeError(
+            "Automatically determined accumulation data type does not "
+            "have direct implementation"
+        )
+    orig_out = out
+    if out is not None:
+        if not isinstance(out, dpt.usm_ndarray):
+            raise TypeError(
+                f"output array must be of usm_ndarray type, got {type(out)}"
+            )
+        if not out.flags.writable:
+            raise ValueError("provided `out` array is read-only")
+        out_sh = out.shape
+        # append an axis to `out` if scalar
+        if appended_axis and not include_initial:
+            out = out[dpt.newaxis, ...]
+            orig_out = out
+            final_res_sh = res_sh[1:]
+        else:
+            final_res_sh = res_sh
+        if not out_sh == final_res_sh:
+            raise ValueError(
+                "The shape of input and output arrays are inconsistent. "
+                f"Expected output shape is {final_res_sh}, got {out_sh}"
+            )
+        if res_dt != out.dtype:
+            raise ValueError(
+                f"Output array of type {res_dt} is needed, " f"got {out.dtype}"
+            )
+        if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None:
+            raise ExecutionPlacementError(
+                "Input and output allocation queues are not compatible"
+            )
+        # permute out array dims if necessary
+        if a1 != nd:
+            out = dpt_ext.permute_dims(out, perm)
+            orig_out = out
+        if ti._array_overlap(x, out) and implemented_types:
+            out = dpt_ext.empty_like(out)
+    else:
+        out = dpt_ext.empty(
+            res_sh, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
+        )
+        if a1 != nd:
+            out = dpt_ext.permute_dims(out, perm)
+
+    _manager = SequentialOrderManager[q]
+    depends = _manager.submitted_events
+    if implemented_types:
+        if not include_initial:
+            ht_e, acc_ev = _accumulate_fn(
+                src=arr,
+                trailing_dims_to_accumulate=1,
+                dst=out,
+                sycl_queue=q,
+                depends=depends,
+            )
+        else:
+            ht_e, acc_ev = _accumulate_include_initial_fn(
+                src=arr, dst=out, sycl_queue=q, depends=depends
+            )
+        _manager.add_event_pair(ht_e, acc_ev)
+        if not (orig_out is None or out is orig_out):
+            # Copy the out data from temporary buffer to original memory
+            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out, dst=orig_out, sycl_queue=q, depends=[acc_ev]
+            )
+            _manager.add_event_pair(ht_e_cpy, cpy_e)
+            out = orig_out
+    else:
+        if _dtype_supported(res_dt, res_dt):
+            tmp = dpt_ext.empty(
+                arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
+            )
+            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=arr, dst=tmp, sycl_queue=q, depends=depends
+            )
+            _manager.add_event_pair(ht_e_cpy, cpy_e)
+            if not include_initial:
+                ht_e, acc_ev = _accumulate_fn(
+                    src=tmp,
+                    trailing_dims_to_accumulate=1,
+                    dst=out,
+                    sycl_queue=q,
+                    depends=[cpy_e],
+                )
+            else:
+                ht_e, acc_ev = _accumulate_include_initial_fn(
+                    src=tmp,
+                    dst=out,
+                    sycl_queue=q,
+                    depends=[cpy_e],
+                )
+            _manager.add_event_pair(ht_e, acc_ev)
+        else:
+            buf_dt = _default_accumulation_type_fn(inp_dt, q)
+            tmp = dpt_ext.empty(
+                arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
+            )
+            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=arr, dst=tmp, sycl_queue=q, depends=depends
+            )
+            _manager.add_event_pair(ht_e_cpy, cpy_e)
+            tmp_res = dpt_ext.empty(
+                res_sh, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
+            )
+            if a1 != nd:
+                tmp_res = dpt_ext.permute_dims(tmp_res, perm)
+            if not include_initial:
+                ht_e, acc_ev = _accumulate_fn(
+                    src=tmp,
+                    trailing_dims_to_accumulate=1,
+                    dst=tmp_res,
+                    sycl_queue=q,
+                    depends=[cpy_e],
+                )
+            else:
+                ht_e, acc_ev = _accumulate_include_initial_fn(
+                    src=tmp,
+                    dst=tmp_res,
+                    sycl_queue=q,
+                    depends=[cpy_e],
+                )
+            _manager.add_event_pair(ht_e, acc_ev)
+            ht_e_cpy2, cpy_e2 = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=tmp_res, dst=out, sycl_queue=q, depends=[acc_ev]
+            )
+            _manager.add_event_pair(ht_e_cpy2, cpy_e2)
+
+    if appended_axis:
+        out = dpt_ext.squeeze(out)
+    if a1 != nd:
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        out = dpt_ext.permute_dims(out, inv_perm)
+
+    return out
+
+
+def cumulative_sum(
+    x, /, *, axis=None, dtype=None, include_initial=False, out=None
+):
+    """
+    cumulative_sum(x, /, *, axis=None, dtype=None, include_initial=False,
+                   out=None)
+
+    Calculates the cumulative sum of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which cumulative sum must be computed.
+            If `None`, the sum is computed over the entire array.
+            If `x` is a one-dimensional array, providing an `axis` is optional;
+            however, if `x` has more than one dimension, providing an `axis`
+            is required.
+            Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the default data
+            type is inferred from the "kind" of the input array data type.
+
+                * If `x` has a real- or complex-valued floating-point data
+                  type, the returned array will have the same data type as
+                  `x`.
+                * If `x` has signed integral data type, the returned array
+                  will have the default signed integral type for the device
+                  where input array `x` is allocated.
+                * If `x` has unsigned integral data type, the returned array
+                  will have the default unsigned integral type for the device
+                  where input array `x` is allocated.
+                * If `x` has a boolean data type, the returned array will
+                  have the default signed integral type for the device
+                  where input array `x` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of `x`, the input array elements are cast to the
+            specified data type before computing the cumulative sum.
+            Default: `None`.
+        include_initial (bool):
+            boolean indicating whether to include the initial value (i.e., the
+            additive identity, zero) as the first value along the provided axis
+            in the output. Default: `False`.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of `out` must match the expected shape and the
+            expected data type of the result or (if provided) `dtype`.
+            If `None` then a new array is returned. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            an array containing cumulative sums. The returned array has the data
+            type as described in the `dtype` parameter description above.
+
+            The returned array shape is determined as follows:
+
+                * If `include_initial` is `False`, the returned array will
+                  have the same shape as `x`
+                * If `include_initial` is `True`, the returned array will
+                  have the same shape as `x` except the axis along which the
+                  cumulative sum is calculated, which will have size `N+1`
+
+            where `N` is the size of the axis the cumulative sums are computed
+            along.
+    """
+    return _accumulate_common(
+        x,
+        axis,
+        dtype,
+        include_initial,
+        out,
+        tai._cumsum_over_axis,
+        tai._cumsum_final_axis_include_initial,
+        tai._cumsum_dtype_supported,
+        _default_accumulation_dtype,
+    )
+
+
+def cumulative_prod(
+    x, /, *, axis=None, dtype=None, include_initial=False, out=None
+):
+    """
+    cumulative_prod(x, /, *, axis=None, dtype=None, include_initial=False,
+                   out=None)
+
+    Calculates the cumulative product of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which cumulative product must be computed.
+            If `None`, the product is computed over the entire array.
+            If `x` is a one-dimensional array, providing an `axis` is optional;
+            however, if `x` has more than one dimension, providing an `axis`
+            is required.
+            Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the default data
+            type is inferred from the "kind" of the input array data type.
+
+                * If `x` has a real- or complex-valued floating-point data
+                  type, the returned array will have the same data type as
+                  `x`.
+                * If `x` has signed integral data type, the returned array
+                  will have the default signed integral type for the device
+                  where input array `x` is allocated.
+                * If `x` has unsigned integral data type, the returned array
+                  will have the default unsigned integral type for the device
+                  where input array `x` is allocated.
+                * If `x` has a boolean data type, the returned array will
+                  have the default signed integral type for the device
+                  where input array `x` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of `x`, the input array elements are cast to the
+            specified data type before computing the cumulative product.
+            Default: `None`.
+        include_initial (bool):
+            boolean indicating whether to include the initial value (i.e., the
+            additive identity, zero) as the first value along the provided
+            axis in the output. Default: `False`.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of `out` must match the expected shape and the
+            expected data type of the result or (if provided) `dtype`.
+            If `None` then a new array is returned. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            an array containing cumulative products. The returned array has
+            the data type as described in the `dtype` parameter description
+            above.
+
+            The returned array shape is determined as follows:
+
+                * If `include_initial` is `False`, the returned array will
+                  have the same shape as `x`
+                * If `include_initial` is `True`, the returned array will
+                  have the same shape as `x` except the axis along which the
+                  cumulative product is calculated, which will have size `N+1`
+
+            where `N` is the size of the axis the cumulative products are
+            computed along.
+    """
+    return _accumulate_common(
+        x,
+        axis,
+        dtype,
+        include_initial,
+        out,
+        tai._cumprod_over_axis,
+        tai._cumprod_final_axis_include_initial,
+        tai._cumprod_dtype_supported,
+        _default_accumulation_dtype,
+    )
+
+
+def cumulative_logsumexp(
+    x, /, *, axis=None, dtype=None, include_initial=False, out=None
+):
+    """
+    cumulative_logsumexp(x, /, *, axis=None, dtype=None, include_initial=False,
+                   out=None)
+
+    Calculates the cumulative logsmumexp of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which cumulative logsumexp must be computed.
+            If `None`, the logsumexp is computed over the entire array.
+            If `x` is a one-dimensional array, providing an `axis` is optional;
+            however, if `x` has more than one dimension, providing an `axis`
+            is required.
+            Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the default data
+            type is inferred from the "kind" of the input array data type.
+
+                * If `x` has a real- or complex-valued floating-point data
+                  type, the returned array will have the same data type as
+                  `x`.
+                * If `x` has signed integral data type, the returned array
+                  will have the default signed integral type for the device
+                  where input array `x` is allocated.
+                * If `x` has unsigned integral data type, the returned array
+                  will have the default unsigned integral type for the device
+                  where input array `x` is allocated.
+                * If `x` has a boolean data type, the returned array will
+                  have the default signed integral type for the device
+                  where input array `x` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of `x`, the input array elements are cast to the
+            specified data type before computing the cumulative logsumexp.
+            Default: `None`.
+        include_initial (bool):
+            boolean indicating whether to include the initial value (i.e., the
+            additive identity, zero) as the first value along the provided axis
+            in the output. Default: `False`.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of `out` must match the expected shape and the
+            expected data type of the result or (if provided) `dtype`.
+            If `None` then a new array is returned. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            an array containing cumulative logsumexp results. The returned
+            array has the data type as described in the `dtype` parameter
+            description above.
+
+            The returned array shape is determined as follows:
+
+                * If `include_initial` is `False`, the returned array will
+                  have the same shape as `x`
+                * If `include_initial` is `True`, the returned array will
+                  have the same shape as `x` except the axis along which the
+                  cumulative logsumexp is calculated, which will have size
+                  `N+1`
+    """
+    return _accumulate_common(
+        x,
+        axis,
+        dtype,
+        include_initial,
+        out,
+        tai._cumlogsumexp_over_axis,
+        tai._cumlogsumexp_final_axis_include_initial,
+        tai._cumlogsumexp_dtype_supported,
+        _default_accumulation_dtype_fp_types,
+    )
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
new file mode 100644
index 000000000000..4dd00620a260
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
@@ -0,0 +1,462 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+//  extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <exception>
+#include <iterator>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "kernels/accumulators.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename strided_fnT, typename contig_fnT>
+std::pair<sycl::event, sycl::event>
+    py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src,
+                            const int trailing_dims_to_accumulate,
+                            const dpctl::tensor::usm_ndarray &dst,
+                            sycl::queue &exec_q,
+                            std::vector<sycl::event> const &depends,
+                            const strided_fnT &strided_dispatch_table,
+                            const contig_fnT &contig_dispatch_table)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+    int iter_nd = src_nd - trailing_dims_to_accumulate;
+    if (trailing_dims_to_accumulate <= 0 || iter_nd < 0) {
+        throw py::value_error(
+            "trailing_dims_to_accumulate must be positive, but no "
+            "greater than rank of the input array");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+    for (int i = 0; same_shapes && (i < iter_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t acc_nelems(1);
+    for (int i = iter_nd; same_shapes && (i < src_nd); ++i) {
+        auto dst_shape_i = dst_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_i);
+        acc_nelems *= static_cast<std::size_t>(dst_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if ((iter_nelems == 0) || (acc_nelems == 0)) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, acc_nelems * iter_nelems);
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    std::vector<sycl::event> host_task_events;
+
+    if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) {
+        auto fn = contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn == nullptr) {
+            throw std::runtime_error("Datatypes are not supported");
+        }
+
+        sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data,
+                                host_task_events, depends);
+
+        return std::make_pair(
+            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}),
+            acc_ev);
+    }
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    int acc_nd = trailing_dims_to_accumulate;
+
+    using shT = std::vector<py::ssize_t>;
+    shT acc_shape(std::begin(src_shape_vec) + iter_nd, std::end(src_shape_vec));
+
+    shT acc_src_strides(std::begin(src_strides_vec) + iter_nd,
+                        std::end(src_strides_vec));
+
+    shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd,
+                        std::end(dst_strides_vec));
+
+    shT iter_shape(std::begin(src_shape_vec),
+                   std::begin(src_shape_vec) + iter_nd);
+
+    shT iter_src_strides(std::begin(src_strides_vec),
+                         std::begin(src_strides_vec) + iter_nd);
+
+    shT iter_dst_strides(std::begin(dst_strides_vec),
+                         std::begin(dst_strides_vec) + iter_nd);
+
+    shT simplified_iter_shape;
+    shT simplified_iter_src_strides;
+    shT simplified_iter_dst_strides;
+    py::ssize_t iter_src_offset(0);
+    py::ssize_t iter_dst_offset(0);
+
+    if (iter_nd == 0) {
+        iter_nd = 1;
+        simplified_iter_shape.push_back(1);
+        simplified_iter_src_strides.push_back(0);
+        simplified_iter_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
+            // output
+            simplified_iter_shape, simplified_iter_src_strides,
+            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
+    }
+
+    // Strided implementation
+    auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid];
+    if (strided_fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_iter_shape,
+        simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape,
+        acc_src_strides, acc_dst_strides);
+    auto packed_shapes_and_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shapes_and_strides =
+        packed_shapes_and_strides_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
+    const py::ssize_t *acc_shapes_and_strides =
+        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), copy_shapes_strides_ev);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+
+    sycl::event acc_ev = strided_fn(
+        exec_q, iter_nelems, acc_nelems, src_data, iter_nd,
+        iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd,
+        acc_shapes_and_strides, dst_data, host_task_events, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {acc_ev}, packed_shapes_and_strides_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events),
+        acc_ev);
+}
+
+template <typename strided_fnT, typename contig_fnT>
+std::pair<sycl::event, sycl::event> py_accumulate_final_axis_include_initial(
+    const dpctl::tensor::usm_ndarray &src,
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    std::vector<sycl::event> const &depends,
+    const strided_fnT &strided_dispatch_table,
+    const contig_fnT &contig_dispatch_table)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+
+    static constexpr int acc_nd = 1;
+
+    int iter_nd = src_nd - acc_nd;
+    if (iter_nd < 0) {
+        throw py::value_error("accumulation axis must not be greater than rank "
+                              "of the input array");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+    for (int i = 0; same_shapes && (i < iter_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t acc_nelems(1);
+    for (int i = iter_nd; same_shapes && (i < src_nd); ++i) {
+        auto dst_shape_i = dst_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_ptr[i] + 1 == dst_shape_i);
+        acc_nelems *= static_cast<std::size_t>(dst_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if ((iter_nelems == 0) || (acc_nelems == 0)) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, acc_nelems * iter_nelems);
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    std::vector<sycl::event> host_task_events;
+
+    if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) {
+        auto fn = contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn == nullptr) {
+            throw std::runtime_error("Datatypes are not supported");
+        }
+
+        sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data,
+                                host_task_events, depends);
+
+        return std::make_pair(
+            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}),
+            acc_ev);
+    }
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT acc_shape(std::begin(src_shape_vec) + iter_nd, std::end(src_shape_vec));
+
+    shT acc_src_strides(std::begin(src_strides_vec) + iter_nd,
+                        std::end(src_strides_vec));
+
+    shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd,
+                        std::end(dst_strides_vec));
+
+    shT iter_shape(std::begin(src_shape_vec),
+                   std::begin(src_shape_vec) + iter_nd);
+
+    shT iter_src_strides(std::begin(src_strides_vec),
+                         std::begin(src_strides_vec) + iter_nd);
+
+    shT iter_dst_strides(std::begin(dst_strides_vec),
+                         std::begin(dst_strides_vec) + iter_nd);
+
+    shT simplified_iter_shape;
+    shT simplified_iter_src_strides;
+    shT simplified_iter_dst_strides;
+    py::ssize_t iter_src_offset(0);
+    py::ssize_t iter_dst_offset(0);
+
+    if (iter_nd == 0) {
+        iter_nd = 1;
+        simplified_iter_shape.push_back(1);
+        simplified_iter_src_strides.push_back(0);
+        simplified_iter_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
+            // output
+            simplified_iter_shape, simplified_iter_src_strides,
+            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
+    }
+
+    // Strided implementation
+    auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid];
+    if (strided_fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_iter_shape,
+        simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape,
+        acc_src_strides, acc_dst_strides);
+    auto packed_shapes_and_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shapes_and_strides =
+        packed_shapes_and_strides_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
+    const py::ssize_t *acc_shapes_and_strides =
+        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), copy_shapes_strides_ev);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+
+    sycl::event acc_ev = strided_fn(
+        exec_q, iter_nelems, acc_nelems, src_data, iter_nd,
+        iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd,
+        acc_shapes_and_strides, dst_data, host_task_events, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {acc_ev}, packed_shapes_and_strides_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events),
+        acc_ev);
+}
+
+/*! @brief Template implementing Python API for querying accumulation
+ * type support */
+template <typename fnT>
+bool py_accumulate_dtype_supported(const py::dtype &input_dtype,
+                                   const py::dtype &output_dtype,
+                                   const fnT &dispatch_table)
+{
+    int arg_tn =
+        input_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int out_tn =
+        output_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int arg_typeid = -1;
+    int out_typeid = -1;
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    try {
+        arg_typeid = array_types.typenum_to_lookup_id(arg_tn);
+        out_typeid = array_types.typenum_to_lookup_id(out_tn);
+    } catch (const std::exception &e) {
+        throw py::value_error(e.what());
+    }
+
+    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
+        out_typeid >= td_ns::num_types)
+    {
+        throw std::runtime_error("Reduction type support check: lookup failed");
+    }
+
+    // remove_all_extents gets underlying type of table
+    using fn_ptrT = typename std::remove_all_extents<fnT>::type;
+    fn_ptrT fn = nullptr;
+
+    fn = dispatch_table[arg_typeid][out_typeid];
+
+    return (fn != nullptr);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp
new file mode 100644
index 000000000000..5e07e81b7ad5
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp
@@ -0,0 +1,55 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+//  extensions
+//===----------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "cumulative_logsumexp.hpp"
+#include "cumulative_prod.hpp"
+#include "cumulative_sum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+/*! @brief Add accumulators to Python module */
+void init_accumulator_functions(py::module_ m)
+{
+    init_cumulative_logsumexp(m);
+    init_cumulative_prod(m);
+    init_cumulative_sum(m);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp
new file mode 100644
index 000000000000..c33a040a7fa7
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+//  extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_accumulator_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
new file mode 100644
index 000000000000..e24cf56ddd62
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
@@ -0,0 +1,347 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+//  extensions
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "accumulate_over_axis.hpp"
+#include "kernels/accumulators.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace su_ns = dpctl::tensor::sycl_utils;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t;
+static accumulate_1d_contig_impl_fn_ptr_t
+    cumlogsumexp_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t;
+static accumulate_strided_impl_fn_ptr_t
+    cumlogsumexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static accumulate_1d_contig_impl_fn_ptr_t
+    cumlogsumexp_1d_include_initial_contig_dispatch_table[td_ns::num_types]
+                                                         [td_ns::num_types];
+
+static accumulate_strided_impl_fn_ptr_t
+    cumlogsumexp_include_initial_strided_dispatch_table[td_ns::num_types]
+                                                       [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForLogSumExpAccumulation
+{
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
+
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
+
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
+
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
+
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, double>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, double>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct CumLogSumExp1DContigFactory
+{
+    fnT get()
+    {
+        if constexpr (TypePairSupportDataForLogSumExpAccumulation<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ScanOpT = su_ns::LogSumExp<dstTy>;
+            static constexpr bool include_initial = false;
+            if constexpr (std::is_same_v<srcTy, dstTy>) {
+                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_1d_contig_impl<srcTy, dstTy,
+                                              NoOpTransformer<dstTy>, ScanOpT,
+                                              include_initial>;
+                return fn;
+            }
+            else {
+                using dpctl::tensor::kernels::accumulators::CastTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_1d_contig_impl<srcTy, dstTy,
+                                              CastTransformer<srcTy, dstTy>,
+                                              ScanOpT, include_initial>;
+                return fn;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct CumLogSumExp1DIncludeInitialContigFactory
+{
+    fnT get()
+    {
+        if constexpr (TypePairSupportDataForLogSumExpAccumulation<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ScanOpT = su_ns::LogSumExp<dstTy>;
+            static constexpr bool include_initial = true;
+            if constexpr (std::is_same_v<srcTy, dstTy>) {
+                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_1d_contig_impl<srcTy, dstTy,
+                                              NoOpTransformer<dstTy>, ScanOpT,
+                                              include_initial>;
+                return fn;
+            }
+            else {
+                using dpctl::tensor::kernels::accumulators::CastTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_1d_contig_impl<srcTy, dstTy,
+                                              CastTransformer<srcTy, dstTy>,
+                                              ScanOpT, include_initial>;
+                return fn;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct CumLogSumExpStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (TypePairSupportDataForLogSumExpAccumulation<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ScanOpT = su_ns::LogSumExp<dstTy>;
+            static constexpr bool include_initial = false;
+            if constexpr (std::is_same_v<srcTy, dstTy>) {
+                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_strided_impl<srcTy, dstTy,
+                                            NoOpTransformer<dstTy>, ScanOpT,
+                                            include_initial>;
+                return fn;
+            }
+            else {
+                using dpctl::tensor::kernels::accumulators::CastTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_strided_impl<srcTy, dstTy,
+                                            CastTransformer<srcTy, dstTy>,
+                                            ScanOpT, include_initial>;
+                return fn;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct CumLogSumExpIncludeInitialStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (TypePairSupportDataForLogSumExpAccumulation<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ScanOpT = su_ns::LogSumExp<dstTy>;
+            static constexpr bool include_initial = true;
+            if constexpr (std::is_same_v<srcTy, dstTy>) {
+                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_strided_impl<srcTy, dstTy,
+                                            NoOpTransformer<dstTy>, ScanOpT,
+                                            include_initial>;
+                return fn;
+            }
+            else {
+                using dpctl::tensor::kernels::accumulators::CastTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_strided_impl<srcTy, dstTy,
+                                            CastTransformer<srcTy, dstTy>,
+                                            ScanOpT, include_initial>;
+                return fn;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_cumlogsumexp_dispatch_tables(void)
+{
+    td_ns::DispatchTableBuilder<accumulate_1d_contig_impl_fn_ptr_t,
+                                CumLogSumExp1DContigFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(cumlogsumexp_1d_contig_dispatch_table);
+
+    td_ns::DispatchTableBuilder<accumulate_strided_impl_fn_ptr_t,
+                                CumLogSumExpStridedFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(cumlogsumexp_strided_dispatch_table);
+
+    td_ns::DispatchTableBuilder<accumulate_1d_contig_impl_fn_ptr_t,
+                                CumLogSumExp1DIncludeInitialContigFactory,
+                                td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(
+        cumlogsumexp_1d_include_initial_contig_dispatch_table);
+
+    td_ns::DispatchTableBuilder<accumulate_strided_impl_fn_ptr_t,
+                                CumLogSumExpIncludeInitialStridedFactory,
+                                td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        cumlogsumexp_include_initial_strided_dispatch_table);
+
+    return;
+}
+
+} // namespace impl
+
+void init_cumulative_logsumexp(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+
+    using impl::populate_cumlogsumexp_dispatch_tables;
+    populate_cumlogsumexp_dispatch_tables();
+
+    using impl::cumlogsumexp_1d_contig_dispatch_table;
+    using impl::cumlogsumexp_strided_dispatch_table;
+    auto cumlogsumexp_pyapi = [&](const arrayT &src,
+                                  int trailing_dims_to_accumulate,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+        return py_accumulate_over_axis(src, trailing_dims_to_accumulate, dst,
+                                       exec_q, depends,
+                                       cumlogsumexp_strided_dispatch_table,
+                                       cumlogsumexp_1d_contig_dispatch_table);
+    };
+    m.def("_cumlogsumexp_over_axis", cumlogsumexp_pyapi, "", py::arg("src"),
+          py::arg("trailing_dims_to_accumulate"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    using impl::cumlogsumexp_1d_include_initial_contig_dispatch_table;
+    using impl::cumlogsumexp_include_initial_strided_dispatch_table;
+    auto cumlogsumexp_include_initial_pyapi =
+        [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+            const event_vecT &depends = {}) {
+            return py_accumulate_final_axis_include_initial(
+                src, dst, exec_q, depends,
+                cumlogsumexp_include_initial_strided_dispatch_table,
+                cumlogsumexp_1d_include_initial_contig_dispatch_table);
+        };
+    m.def("_cumlogsumexp_final_axis_include_initial",
+          cumlogsumexp_include_initial_pyapi, "", py::arg("src"),
+          py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    auto cumlogsumexp_dtype_supported = [&](const py::dtype &input_dtype,
+                                            const py::dtype &output_dtype) {
+        return py_accumulate_dtype_supported(
+            input_dtype, output_dtype, cumlogsumexp_strided_dispatch_table);
+    };
+    m.def("_cumlogsumexp_dtype_supported", cumlogsumexp_dtype_supported, "",
+          py::arg("arg_dtype"), py::arg("out_dtype"));
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp
new file mode 100644
index 000000000000..f1292320bd0d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+//  extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cumulative_logsumexp(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp
new file mode 100644
index 000000000000..65f3c311eda1
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp
@@ -0,0 +1,356 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+//  extensions
+//===----------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "accumulate_over_axis.hpp"
+#include "kernels/accumulators.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t;
+static accumulate_1d_contig_impl_fn_ptr_t
+    cumprod_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t;
+static accumulate_strided_impl_fn_ptr_t
+    cumprod_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static accumulate_1d_contig_impl_fn_ptr_t
+    cumprod_1d_include_initial_contig_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+
+static accumulate_strided_impl_fn_ptr_t
+    cumprod_include_initial_strided_dispatch_table[td_ns::num_types]
+                                                  [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForProdAccumulation
+{
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
+
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    outTy,
+                                    std::complex<float>>,
+
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    outTy,
+                                    std::complex<double>>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename T>
+using CumProdScanOpT = std::conditional_t<std::is_same_v<T, bool>,
+                                          sycl::logical_and<T>,
+                                          sycl::multiplies<T>>;
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct CumProd1DContigFactory
+{
+    fnT get()
+    {
+        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
+                                                             dstTy>::is_defined)
+        {
+            using ScanOpT = CumProdScanOpT<dstTy>;
+            static constexpr bool include_initial = false;
+            if constexpr (std::is_same_v<srcTy, dstTy>) {
+                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_1d_contig_impl<srcTy, dstTy,
+                                              NoOpTransformer<dstTy>, ScanOpT,
+                                              include_initial>;
+                return fn;
+            }
+            else {
+                using dpctl::tensor::kernels::accumulators::CastTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_1d_contig_impl<srcTy, dstTy,
+                                              CastTransformer<srcTy, dstTy>,
+                                              ScanOpT, include_initial>;
+                return fn;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct CumProd1DIncludeInitialContigFactory
+{
+    fnT get()
+    {
+        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
+                                                             dstTy>::is_defined)
+        {
+            using ScanOpT = CumProdScanOpT<dstTy>;
+            static constexpr bool include_initial = true;
+            if constexpr (std::is_same_v<srcTy, dstTy>) {
+                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_1d_contig_impl<srcTy, dstTy,
+                                              NoOpTransformer<dstTy>, ScanOpT,
+                                              include_initial>;
+                return fn;
+            }
+            else {
+                using dpctl::tensor::kernels::accumulators::CastTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_1d_contig_impl<srcTy, dstTy,
+                                              CastTransformer<srcTy, dstTy>,
+                                              ScanOpT, include_initial>;
+                return fn;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct CumProdStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
+                                                             dstTy>::is_defined)
+        {
+            using ScanOpT = CumProdScanOpT<dstTy>;
+            static constexpr bool include_initial = false;
+            if constexpr (std::is_same_v<srcTy, dstTy>) {
+                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_strided_impl<srcTy, dstTy,
+                                            NoOpTransformer<dstTy>, ScanOpT,
+                                            include_initial>;
+                return fn;
+            }
+            else {
+                using dpctl::tensor::kernels::accumulators::CastTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_strided_impl<srcTy, dstTy,
+                                            CastTransformer<srcTy, dstTy>,
+                                            ScanOpT, include_initial>;
+                return fn;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct CumProdIncludeInitialStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
+                                                             dstTy>::is_defined)
+        {
+            using ScanOpT = CumProdScanOpT<dstTy>;
+            static constexpr bool include_initial = true;
+            if constexpr (std::is_same_v<srcTy, dstTy>) {
+                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_strided_impl<srcTy, dstTy,
+                                            NoOpTransformer<dstTy>, ScanOpT,
+                                            include_initial>;
+                return fn;
+            }
+            else {
+                using dpctl::tensor::kernels::accumulators::CastTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_strided_impl<srcTy, dstTy,
+                                            CastTransformer<srcTy, dstTy>,
+                                            ScanOpT, include_initial>;
+                return fn;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_cumprod_dispatch_tables(void)
+{
+    td_ns::DispatchTableBuilder<accumulate_1d_contig_impl_fn_ptr_t,
+                                CumProd1DContigFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(cumprod_1d_contig_dispatch_table);
+
+    td_ns::DispatchTableBuilder<accumulate_strided_impl_fn_ptr_t,
+                                CumProdStridedFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(cumprod_strided_dispatch_table);
+
+    td_ns::DispatchTableBuilder<accumulate_1d_contig_impl_fn_ptr_t,
+                                CumProd1DIncludeInitialContigFactory,
+                                td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(
+        cumprod_1d_include_initial_contig_dispatch_table);
+
+    td_ns::DispatchTableBuilder<accumulate_strided_impl_fn_ptr_t,
+                                CumProdIncludeInitialStridedFactory,
+                                td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        cumprod_include_initial_strided_dispatch_table);
+
+    return;
+}
+
+} // namespace impl
+
+void init_cumulative_prod(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+
+    using impl::populate_cumprod_dispatch_tables;
+    populate_cumprod_dispatch_tables();
+
+    using impl::cumprod_1d_contig_dispatch_table;
+    using impl::cumprod_strided_dispatch_table;
+    auto cumprod_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+        return py_accumulate_over_axis(
+            src, trailing_dims_to_accumulate, dst, exec_q, depends,
+            cumprod_strided_dispatch_table, cumprod_1d_contig_dispatch_table);
+    };
+    m.def("_cumprod_over_axis", cumprod_pyapi, "", py::arg("src"),
+          py::arg("trailing_dims_to_accumulate"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    using impl::cumprod_1d_include_initial_contig_dispatch_table;
+    using impl::cumprod_include_initial_strided_dispatch_table;
+    auto cumprod_include_initial_pyapi =
+        [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+            const event_vecT &depends = {}) {
+            return py_accumulate_final_axis_include_initial(
+                src, dst, exec_q, depends,
+                cumprod_include_initial_strided_dispatch_table,
+                cumprod_1d_include_initial_contig_dispatch_table);
+        };
+    m.def("_cumprod_final_axis_include_initial", cumprod_include_initial_pyapi,
+          "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    auto cumprod_dtype_supported = [&](const py::dtype &input_dtype,
+                                       const py::dtype &output_dtype) {
+        return py_accumulate_dtype_supported(input_dtype, output_dtype,
+                                             cumprod_strided_dispatch_table);
+    };
+    m.def("_cumprod_dtype_supported", cumprod_dtype_supported, "",
+          py::arg("arg_dtype"), py::arg("out_dtype"));
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp
new file mode 100644
index 000000000000..e14bb2c44361
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+//  extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cumulative_prod(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp
new file mode 100644
index 000000000000..60b46946acc9
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp
@@ -0,0 +1,354 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+//  extensions
+//===----------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "accumulate_over_axis.hpp"
+#include "kernels/accumulators.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t;
+static accumulate_1d_contig_impl_fn_ptr_t
+    cumsum_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t;
+static accumulate_strided_impl_fn_ptr_t
+    cumsum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static accumulate_1d_contig_impl_fn_ptr_t
+    cumsum_1d_include_initial_contig_dispatch_table[td_ns::num_types]
+                                                   [td_ns::num_types];
+
+static accumulate_strided_impl_fn_ptr_t
+    cumsum_include_initial_strided_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForSumAccumulation
+{
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
+
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    outTy,
+                                    std::complex<float>>,
+
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    outTy,
+                                    std::complex<double>>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename T>
+using CumSumScanOpT = std::
+    conditional_t<std::is_same_v<T, bool>, sycl::logical_or<T>, sycl::plus<T>>;
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct CumSum1DContigFactory
+{
+    fnT get()
+    {
+        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
+                                                            dstTy>::is_defined)
+        {
+            using ScanOpT = CumSumScanOpT<dstTy>;
+            static constexpr bool include_initial = false;
+            if constexpr (std::is_same_v<srcTy, dstTy>) {
+                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_1d_contig_impl<srcTy, dstTy,
+                                              NoOpTransformer<dstTy>, ScanOpT,
+                                              include_initial>;
+                return fn;
+            }
+            else {
+                using dpctl::tensor::kernels::accumulators::CastTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_1d_contig_impl<srcTy, dstTy,
+                                              CastTransformer<srcTy, dstTy>,
+                                              ScanOpT, include_initial>;
+                return fn;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct CumSum1DIncludeInitialContigFactory
+{
+    fnT get()
+    {
+        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
+                                                            dstTy>::is_defined)
+        {
+            using ScanOpT = CumSumScanOpT<dstTy>;
+            static constexpr bool include_initial = true;
+            if constexpr (std::is_same_v<srcTy, dstTy>) {
+                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_1d_contig_impl<srcTy, dstTy,
+                                              NoOpTransformer<dstTy>, ScanOpT,
+                                              include_initial>;
+                return fn;
+            }
+            else {
+                using dpctl::tensor::kernels::accumulators::CastTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_1d_contig_impl<srcTy, dstTy,
+                                              CastTransformer<srcTy, dstTy>,
+                                              ScanOpT, include_initial>;
+                return fn;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct CumSumStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
+                                                            dstTy>::is_defined)
+        {
+            using ScanOpT = CumSumScanOpT<dstTy>;
+            static constexpr bool include_initial = false;
+            if constexpr (std::is_same_v<srcTy, dstTy>) {
+                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_strided_impl<srcTy, dstTy,
+                                            NoOpTransformer<dstTy>, ScanOpT,
+                                            include_initial>;
+                return fn;
+            }
+            else {
+                using dpctl::tensor::kernels::accumulators::CastTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_strided_impl<srcTy, dstTy,
+                                            CastTransformer<srcTy, dstTy>,
+                                            ScanOpT, include_initial>;
+                return fn;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct CumSumIncludeInitialStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
+                                                            dstTy>::is_defined)
+        {
+            using ScanOpT = CumSumScanOpT<dstTy>;
+            static constexpr bool include_initial = true;
+            if constexpr (std::is_same_v<srcTy, dstTy>) {
+                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_strided_impl<srcTy, dstTy,
+                                            NoOpTransformer<dstTy>, ScanOpT,
+                                            include_initial>;
+                return fn;
+            }
+            else {
+                using dpctl::tensor::kernels::accumulators::CastTransformer;
+                fnT fn = dpctl::tensor::kernels::accumulators::
+                    accumulate_strided_impl<srcTy, dstTy,
+                                            CastTransformer<srcTy, dstTy>,
+                                            ScanOpT, include_initial>;
+                return fn;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_cumsum_dispatch_tables(void)
+{
+    td_ns::DispatchTableBuilder<accumulate_1d_contig_impl_fn_ptr_t,
+                                CumSum1DContigFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(cumsum_1d_contig_dispatch_table);
+
+    td_ns::DispatchTableBuilder<accumulate_strided_impl_fn_ptr_t,
+                                CumSumStridedFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(cumsum_strided_dispatch_table);
+
+    td_ns::DispatchTableBuilder<accumulate_1d_contig_impl_fn_ptr_t,
+                                CumSum1DIncludeInitialContigFactory,
+                                td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(
+        cumsum_1d_include_initial_contig_dispatch_table);
+
+    td_ns::DispatchTableBuilder<accumulate_strided_impl_fn_ptr_t,
+                                CumSumIncludeInitialStridedFactory,
+                                td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(cumsum_include_initial_strided_dispatch_table);
+
+    return;
+}
+
+} // namespace impl
+
+void init_cumulative_sum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+
+    using impl::populate_cumsum_dispatch_tables;
+    populate_cumsum_dispatch_tables();
+
+    using impl::cumsum_1d_contig_dispatch_table;
+    using impl::cumsum_strided_dispatch_table;
+    auto cumsum_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate,
+                            const arrayT &dst, sycl::queue &exec_q,
+                            const event_vecT &depends = {}) {
+        return py_accumulate_over_axis(
+            src, trailing_dims_to_accumulate, dst, exec_q, depends,
+            cumsum_strided_dispatch_table, cumsum_1d_contig_dispatch_table);
+    };
+    m.def("_cumsum_over_axis", cumsum_pyapi, "", py::arg("src"),
+          py::arg("trailing_dims_to_accumulate"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    using impl::cumsum_1d_include_initial_contig_dispatch_table;
+    using impl::cumsum_include_initial_strided_dispatch_table;
+    auto cumsum_include_initial_pyapi =
+        [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+            const event_vecT &depends = {}) {
+            return py_accumulate_final_axis_include_initial(
+                src, dst, exec_q, depends,
+                cumsum_include_initial_strided_dispatch_table,
+                cumsum_1d_include_initial_contig_dispatch_table);
+        };
+    m.def("_cumsum_final_axis_include_initial", cumsum_include_initial_pyapi,
+          "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    auto cumsum_dtype_supported = [&](const py::dtype &input_dtype,
+                                      const py::dtype &output_dtype) {
+        return py_accumulate_dtype_supported(input_dtype, output_dtype,
+                                             cumsum_strided_dispatch_table);
+    };
+    m.def("_cumsum_dtype_supported", cumsum_dtype_supported, "",
+          py::arg("arg_dtype"), py::arg("out_dtype"));
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp
new file mode 100644
index 000000000000..5e06b222a3bc
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+//  extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cumulative_sum(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp b/dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp
new file mode 100644
index 000000000000..faa3fc8b52c6
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp
@@ -0,0 +1,43 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+//  extensions
+//===----------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "accumulators/accumulators_common.hpp"
+
+PYBIND11_MODULE(_tensor_accumulation_impl, m)
+{
+    dpctl::tensor::py_internal::init_accumulator_functions(m);
+}
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index 06f4fe936253..000c343abdb4 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -1126,7 +1126,7 @@ def cumprod(a, axis=None, dtype=None, out=None):
     return dpnp_wrap_reduction_call(
         usm_a,
         out,
-        dpt.cumulative_prod,
+        dpt_ext.cumulative_prod,
         _get_reduction_res_dt(a, dtype),
         axis=axis,
         dtype=dtype,
@@ -1218,7 +1218,7 @@ def cumsum(a, axis=None, dtype=None, out=None):
     return dpnp_wrap_reduction_call(
         usm_a,
         out,
-        dpt.cumulative_sum,
+        dpt_ext.cumulative_sum,
         _get_reduction_res_dt(a, dtype),
         axis=axis,
         dtype=dtype,
@@ -1307,7 +1307,7 @@ def cumulative_prod(
     return dpnp_wrap_reduction_call(
         dpnp.get_usm_ndarray(x),
         out,
-        dpt.cumulative_prod,
+        dpt_ext.cumulative_prod,
         _get_reduction_res_dt(x, dtype),
         axis=axis,
         dtype=dtype,
@@ -1403,7 +1403,7 @@ def cumulative_sum(
     return dpnp_wrap_reduction_call(
         dpnp.get_usm_ndarray(x),
         out,
-        dpt.cumulative_sum,
+        dpt_ext.cumulative_sum,
         _get_reduction_res_dt(x, dtype),
         axis=axis,
         dtype=dtype,
diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py
index 9894bd304701..460a0dc80f0f 100644
--- a/dpnp/dpnp_iface_trigonometric.py
+++ b/dpnp/dpnp_iface_trigonometric.py
@@ -48,6 +48,7 @@
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
@@ -934,7 +935,7 @@ def cumlogsumexp(
     return dpnp_wrap_reduction_call(
         usm_x,
         out,
-        dpt.cumulative_logsumexp,
+        dpt_ext.cumulative_logsumexp,
         _get_accumulation_res_dt(x, dtype),
         axis=axis,
         dtype=dtype,

From dd62b6ef2a317247648a169206bf6c7bfdee7cb2 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 9 Mar 2026 16:47:05 +0100
Subject: [PATCH 10/43] Move `_tensor_sorting_impl` extension and use it for
 dpnp (#2793)

This PR completely moves `_tensor_sorting_impl` pybind11 extension into
`dpctl_ext.tensor` and extends dpctl_ext.tensor Python API with the
functions `searchsorted isin, unique_all, unique_counts, unique_inverse,
unique_values, argsort, sort and top_k ` reusing them in dpnp
---
 dpctl_ext/tensor/CMakeLists.txt               |   21 +-
 dpctl_ext/tensor/__init__.py                  |   18 +
 dpctl_ext/tensor/_searchsorted.py             |  189 ++
 dpctl_ext/tensor/_set_functions.py            |  803 +++++++
 dpctl_ext/tensor/_sorting.py                  |  450 ++++
 .../include/kernels/sorting/isin.hpp          |  245 +++
 .../include/kernels/sorting/merge_sort.hpp    |  856 ++++++++
 .../include/kernels/sorting/radix_sort.hpp    | 1921 +++++++++++++++++
 .../kernels/sorting/search_sorted_detail.hpp  |  119 +
 .../include/kernels/sorting/searchsorted.hpp  |  258 +++
 .../kernels/sorting/sort_impl_fn_ptr_t.hpp    |   61 +
 .../include/kernels/sorting/sort_utils.hpp    |  144 ++
 .../include/kernels/sorting/topk.hpp          |  512 +++++
 .../include/utils/rich_comparisons.hpp        |  149 ++
 .../tensor/libtensor/source/sorting/isin.cpp  |  325 +++
 .../tensor/libtensor/source/sorting/isin.hpp  |   47 +
 .../source/sorting/merge_argsort.cpp          |  157 ++
 .../source/sorting/merge_argsort.hpp          |   47 +
 .../libtensor/source/sorting/merge_sort.cpp   |  139 ++
 .../libtensor/source/sorting/merge_sort.hpp   |   47 +
 .../source/sorting/py_argsort_common.hpp      |  184 ++
 .../source/sorting/py_sort_common.hpp         |  178 ++
 .../source/sorting/radix_argsort.cpp          |  187 ++
 .../source/sorting/radix_argsort.hpp          |   47 +
 .../libtensor/source/sorting/radix_sort.cpp   |  188 ++
 .../libtensor/source/sorting/radix_sort.hpp   |   47 +
 .../source/sorting/radix_sort_support.hpp     |   78 +
 .../libtensor/source/sorting/searchsorted.cpp |  478 ++++
 .../libtensor/source/sorting/searchsorted.hpp |   47 +
 .../tensor/libtensor/source/sorting/topk.cpp  |  303 +++
 .../tensor/libtensor/source/sorting/topk.hpp  |   47 +
 .../libtensor/source/tensor_sorting.cpp       |   55 +
 dpnp/dpnp_iface_logic.py                      |    5 +-
 dpnp/dpnp_iface_manipulation.py               |   14 +-
 dpnp/dpnp_iface_searching.py                  |    2 +-
 dpnp/dpnp_iface_sorting.py                    |    6 +-
 pyproject.toml                                |    2 +-
 37 files changed, 8362 insertions(+), 14 deletions(-)
 create mode 100644 dpctl_ext/tensor/_searchsorted.py
 create mode 100644 dpctl_ext/tensor/_set_functions.py
 create mode 100644 dpctl_ext/tensor/_sorting.py
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/isin.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/isin.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/topk.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/topk.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index eff5e7552648..056b7c425544 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -69,10 +69,23 @@ set(_accumulator_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp
 )
+set(_sorting_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp
+)
 set(_tensor_accumulation_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp
     ${_accumulator_sources}
 )
+set(_tensor_sorting_impl_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp
+    ${_sorting_sources}
+)
 
 set(_static_lib_trgt simplify_iteration_space)
 
@@ -101,6 +114,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_i
 target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
 list(APPEND _py_trgts ${python_module_name})
 
+set(python_module_name _tensor_sorting_impl)
+pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources})
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources})
+target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
+list(APPEND _py_trgts ${python_module_name})
+
 set(_clang_prefix "")
 if(WIN32)
     set(_clang_prefix "/clang:")
@@ -117,7 +136,7 @@ list(
     APPEND _no_fast_math_sources
     # ${_elementwise_sources}
     # ${_reduction_sources}
-    # ${_sorting_sources}
+    ${_sorting_sources}
     # ${_linalg_sources}
     ${_accumulator_sources}
 )
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index 72c7536ed473..cba7c417d559 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -80,10 +80,20 @@
 )
 from ._reshape import reshape
 from ._search_functions import where
+from ._searchsorted import searchsorted
+from ._set_functions import (
+    isin,
+    unique_all,
+    unique_counts,
+    unique_inverse,
+    unique_values,
+)
+from ._sorting import argsort, sort, top_k
 from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type
 
 __all__ = [
     "arange",
+    "argsort",
     "asarray",
     "asnumpy",
     "astype",
@@ -108,6 +118,7 @@
     "full_like",
     "iinfo",
     "isdtype",
+    "isin",
     "linspace",
     "meshgrid",
     "moveaxis",
@@ -122,15 +133,22 @@
     "reshape",
     "result_type",
     "roll",
+    "searchsorted",
+    "sort",
     "squeeze",
     "stack",
     "swapaxes",
     "take",
     "take_along_axis",
     "tile",
+    "top_k",
     "to_numpy",
     "tril",
     "triu",
+    "unique_all",
+    "unique_counts",
+    "unique_inverse",
+    "unique_values",
     "unstack",
     "where",
     "zeros",
diff --git a/dpctl_ext/tensor/_searchsorted.py b/dpctl_ext/tensor/_searchsorted.py
new file mode 100644
index 000000000000..2d4807fb0d0c
--- /dev/null
+++ b/dpctl_ext/tensor/_searchsorted.py
@@ -0,0 +1,189 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+
+from typing import Literal, Union
+
+import dpctl
+import dpctl.utils as du
+
+# TODO: revert to `from ._usmarray import...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl.tensor._usmarray import usm_ndarray
+
+from ._copy_utils import _empty_like_orderK
+from ._ctors import empty
+from ._tensor_impl import _copy_usm_ndarray_into_usm_ndarray as ti_copy
+from ._tensor_impl import _take as ti_take
+from ._tensor_impl import (
+    default_device_index_type as ti_default_device_index_type,
+)
+from ._tensor_sorting_impl import _searchsorted_left, _searchsorted_right
+from ._type_utils import isdtype, result_type
+
+
+def searchsorted(
+    x1: usm_ndarray,
+    x2: usm_ndarray,
+    /,
+    *,
+    side: Literal["left", "right"] = "left",
+    sorter: Union[usm_ndarray, None] = None,
+) -> usm_ndarray:
+    """searchsorted(x1, x2, side='left', sorter=None)
+
+    Finds the indices into `x1` such that, if the corresponding elements
+    in `x2` were inserted before the indices, the order of `x1`, when sorted
+    in ascending order, would be preserved.
+
+    Args:
+        x1 (usm_ndarray):
+            input array. Must be a one-dimensional array. If `sorter` is
+            `None`, must be sorted in ascending order; otherwise, `sorter` must
+            be an array of indices that sort `x1` in ascending order.
+        x2 (usm_ndarray):
+            array containing search values.
+        side (Literal["left", "right]):
+            argument controlling which index is returned if a value lands
+            exactly on an edge. If `x2` is an array of rank `N` where
+            `v = x2[n, m, ..., j]`, the element `ret[n, m, ..., j]` in the
+            return array `ret` contains the position `i` such that
+            if `side="left"`, it is the first index such that
+            `x1[i-1] < v <= x1[i]`, `0` if `v <= x1[0]`, and `x1.size`
+            if `v > x1[-1]`;
+            and if `side="right"`, it is the first position `i` such that
+            `x1[i-1] <= v < x1[i]`, `0` if `v < x1[0]`, and `x1.size`
+            if `v >= x1[-1]`. Default: `"left"`.
+        sorter (Optional[usm_ndarray]):
+            array of indices that sort `x1` in ascending order. The array must
+            have the same shape as `x1` and have an integral data type.
+            Out of bound index values of `sorter` array are treated using
+            `"wrap"` mode documented in :py:func:`dpctl.tensor.take`.
+            Default: `None`.
+    """
+    if not isinstance(x1, usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
+    if not isinstance(x2, usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
+    if sorter is not None and not isinstance(sorter, usm_ndarray):
+        raise TypeError(
+            f"Expected dpctl.tensor.usm_ndarray, got {type(sorter)}"
+        )
+
+    if side not in ["left", "right"]:
+        raise ValueError(
+            "Unrecognized value of 'side' keyword argument. "
+            "Expected either 'left' or 'right'"
+        )
+
+    if sorter is None:
+        q = du.get_execution_queue([x1.sycl_queue, x2.sycl_queue])
+    else:
+        q = du.get_execution_queue(
+            [x1.sycl_queue, x2.sycl_queue, sorter.sycl_queue]
+        )
+    if q is None:
+        raise du.ExecutionPlacementError(
+            "Execution placement can not be unambiguously "
+            "inferred from input arguments."
+        )
+
+    if x1.ndim != 1:
+        raise ValueError("First argument array must be one-dimensional")
+
+    x1_dt = x1.dtype
+    x2_dt = x2.dtype
+
+    _manager = du.SequentialOrderManager[q]
+    dep_evs = _manager.submitted_events
+    ev = dpctl.SyclEvent()
+    if sorter is not None:
+        if not isdtype(sorter.dtype, "integral"):
+            raise ValueError(
+                f"Sorter array must have integral data type, got {sorter.dtype}"
+            )
+        if x1.shape != sorter.shape:
+            raise ValueError(
+                "Sorter array must be one-dimension with the same "
+                "shape as the first argument array"
+            )
+        res = empty(x1.shape, dtype=x1_dt, usm_type=x1.usm_type, sycl_queue=q)
+        ind = (sorter,)
+        axis = 0
+        wrap_out_of_bound_indices_mode = 0
+        ht_ev, ev = ti_take(
+            x1,
+            ind,
+            res,
+            axis,
+            wrap_out_of_bound_indices_mode,
+            sycl_queue=q,
+            depends=dep_evs,
+        )
+        x1 = res
+        _manager.add_event_pair(ht_ev, ev)
+
+    if x1_dt != x2_dt:
+        dt = result_type(x1, x2)
+        if x1_dt != dt:
+            x1_buf = _empty_like_orderK(x1, dt)
+            dep_evs = _manager.submitted_events
+            ht_ev, ev = ti_copy(
+                src=x1, dst=x1_buf, sycl_queue=q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_ev, ev)
+            x1 = x1_buf
+        if x2_dt != dt:
+            x2_buf = _empty_like_orderK(x2, dt)
+            dep_evs = _manager.submitted_events
+            ht_ev, ev = ti_copy(
+                src=x2, dst=x2_buf, sycl_queue=q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_ev, ev)
+            x2 = x2_buf
+
+    dst_usm_type = du.get_coerced_usm_type([x1.usm_type, x2.usm_type])
+    index_dt = ti_default_device_index_type(q)
+
+    dst = _empty_like_orderK(x2, index_dt, usm_type=dst_usm_type)
+
+    dep_evs = _manager.submitted_events
+    if side == "left":
+        ht_ev, s_ev = _searchsorted_left(
+            hay=x1,
+            needles=x2,
+            positions=dst,
+            sycl_queue=q,
+            depends=dep_evs,
+        )
+    else:
+        ht_ev, s_ev = _searchsorted_right(
+            hay=x1, needles=x2, positions=dst, sycl_queue=q, depends=dep_evs
+        )
+    _manager.add_event_pair(ht_ev, s_ev)
+    return dst
diff --git a/dpctl_ext/tensor/_set_functions.py b/dpctl_ext/tensor/_set_functions.py
new file mode 100644
index 000000000000..93f81f044fd2
--- /dev/null
+++ b/dpctl_ext/tensor/_set_functions.py
@@ -0,0 +1,803 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from typing import NamedTuple, Optional, Union
+
+import dpctl.tensor as dpt
+import dpctl.utils as du
+from dpctl.tensor._tensor_elementwise_impl import _not_equal, _subtract
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+
+from ._copy_utils import _empty_like_orderK
+from ._scalar_utils import (
+    _get_dtype,
+    _get_queue_usm_type,
+    _get_shape,
+    _validate_dtype,
+)
+from ._tensor_impl import (
+    _copy_usm_ndarray_into_usm_ndarray,
+    _extract,
+    _full_usm_ndarray,
+    _linspace_step,
+    _take,
+    default_device_index_type,
+    mask_positions,
+)
+from ._tensor_sorting_impl import (
+    _argsort_ascending,
+    _isin,
+    _searchsorted_left,
+    _sort_ascending,
+)
+from ._type_utils import (
+    _resolve_weak_types_all_py_ints,
+    _to_device_supported_dtype,
+)
+
+__all__ = [
+    "isin",
+    "unique_values",
+    "unique_counts",
+    "unique_inverse",
+    "unique_all",
+    "UniqueAllResult",
+    "UniqueCountsResult",
+    "UniqueInverseResult",
+]
+
+
+class UniqueAllResult(NamedTuple):
+    values: dpt.usm_ndarray
+    indices: dpt.usm_ndarray
+    inverse_indices: dpt.usm_ndarray
+    counts: dpt.usm_ndarray
+
+
+class UniqueCountsResult(NamedTuple):
+    values: dpt.usm_ndarray
+    counts: dpt.usm_ndarray
+
+
+class UniqueInverseResult(NamedTuple):
+    values: dpt.usm_ndarray
+    inverse_indices: dpt.usm_ndarray
+
+
+def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray:
+    """unique_values(x)
+
+    Returns the unique elements of an input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array. Inputs with more than one dimension are flattened.
+    Returns:
+        usm_ndarray
+            an array containing the set of unique elements in `x`. The
+            returned array has the same data type as `x`.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+    array_api_dev = x.device
+    exec_q = array_api_dev.sycl_queue
+    if x.ndim == 1:
+        fx = x
+    else:
+        fx = dpt_ext.reshape(x, (x.size,), order="C")
+    if fx.size == 0:
+        return fx
+    s = dpt_ext.empty_like(fx, order="C")
+    _manager = du.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    if fx.flags.c_contiguous:
+        ht_ev, sort_ev = _sort_ascending(
+            src=fx,
+            trailing_dims_to_sort=1,
+            dst=s,
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_ev, sort_ev)
+    else:
+        tmp = dpt_ext.empty_like(fx, order="C")
+        ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
+            src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_ev, copy_ev)
+        ht_ev, sort_ev = _sort_ascending(
+            src=tmp,
+            trailing_dims_to_sort=1,
+            dst=s,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_ev, sort_ev)
+    unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q)
+    ht_ev, uneq_ev = _not_equal(
+        src1=s[:-1],
+        src2=s[1:],
+        dst=unique_mask[1:],
+        sycl_queue=exec_q,
+        depends=[sort_ev],
+    )
+    _manager.add_event_pair(ht_ev, uneq_ev)
+    # writing into new allocation, no dependencies
+    ht_ev, one_ev = _full_usm_ndarray(
+        fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
+    )
+    _manager.add_event_pair(ht_ev, one_ev)
+    cumsum = dpt_ext.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q)
+    # synchronizing call
+    n_uniques = mask_positions(
+        unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev]
+    )
+    if n_uniques == fx.size:
+        return s
+    unique_vals = dpt_ext.empty(
+        n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
+    )
+    ht_ev, ex_e = _extract(
+        src=s,
+        cumsum=cumsum,
+        axis_start=0,
+        axis_end=1,
+        dst=unique_vals,
+        sycl_queue=exec_q,
+    )
+    _manager.add_event_pair(ht_ev, ex_e)
+    return unique_vals
+
+
+def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
+    """unique_counts(x)
+
+    Returns the unique elements of an input array `x` and the corresponding
+    counts for each unique element in `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array. Inputs with more than one dimension are flattened.
+    Returns:
+        tuple[usm_ndarray, usm_ndarray]
+            a namedtuple `(values, counts)` whose
+
+            * first element is the field name `values` and is an array
+               containing the unique elements of `x`. This array has the
+               same data type as `x`.
+            * second element has the field name `counts` and is an array
+              containing the number of times each unique element occurs in `x`.
+              This array has the same shape as `values` and has the default
+              array index data type.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+    array_api_dev = x.device
+    exec_q = array_api_dev.sycl_queue
+    x_usm_type = x.usm_type
+    if x.ndim == 1:
+        fx = x
+    else:
+        fx = dpt_ext.reshape(x, (x.size,), order="C")
+    ind_dt = default_device_index_type(exec_q)
+    if fx.size == 0:
+        return UniqueCountsResult(fx, dpt_ext.empty_like(fx, dtype=ind_dt))
+    s = dpt_ext.empty_like(fx, order="C")
+
+    _manager = du.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    if fx.flags.c_contiguous:
+        ht_ev, sort_ev = _sort_ascending(
+            src=fx,
+            trailing_dims_to_sort=1,
+            dst=s,
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_ev, sort_ev)
+    else:
+        tmp = dpt_ext.empty_like(fx, order="C")
+        ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
+            src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_ev, copy_ev)
+        ht_ev, sort_ev = _sort_ascending(
+            src=tmp,
+            dst=s,
+            trailing_dims_to_sort=1,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_ev, sort_ev)
+    unique_mask = dpt_ext.empty(s.shape, dtype="?", sycl_queue=exec_q)
+    ht_ev, uneq_ev = _not_equal(
+        src1=s[:-1],
+        src2=s[1:],
+        dst=unique_mask[1:],
+        sycl_queue=exec_q,
+        depends=[sort_ev],
+    )
+    _manager.add_event_pair(ht_ev, uneq_ev)
+    # no dependency, since we write into new allocation
+    ht_ev, one_ev = _full_usm_ndarray(
+        fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
+    )
+    _manager.add_event_pair(ht_ev, one_ev)
+    cumsum = dpt_ext.empty(
+        unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q
+    )
+    # synchronizing call
+    n_uniques = mask_positions(
+        unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev]
+    )
+    if n_uniques == fx.size:
+        return UniqueCountsResult(
+            s,
+            dpt_ext.ones(
+                n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
+            ),
+        )
+    unique_vals = dpt_ext.empty(
+        n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q
+    )
+    # populate unique values
+    ht_ev, ex_e = _extract(
+        src=s,
+        cumsum=cumsum,
+        axis_start=0,
+        axis_end=1,
+        dst=unique_vals,
+        sycl_queue=exec_q,
+    )
+    _manager.add_event_pair(ht_ev, ex_e)
+    unique_counts = dpt_ext.empty(
+        n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
+    )
+    idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
+    # writing into new allocation, no dependency
+    ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q)
+    _manager.add_event_pair(ht_ev, id_ev)
+    ht_ev, extr_ev = _extract(
+        src=idx,
+        cumsum=cumsum,
+        axis_start=0,
+        axis_end=1,
+        dst=unique_counts[:-1],
+        sycl_queue=exec_q,
+        depends=[id_ev],
+    )
+    _manager.add_event_pair(ht_ev, extr_ev)
+    # no dependency, writing into disjoint segmenent of new allocation
+    ht_ev, set_ev = _full_usm_ndarray(
+        x.size, dst=unique_counts[-1], sycl_queue=exec_q
+    )
+    _manager.add_event_pair(ht_ev, set_ev)
+    _counts = dpt_ext.empty_like(unique_counts[1:])
+    ht_ev, sub_ev = _subtract(
+        src1=unique_counts[1:],
+        src2=unique_counts[:-1],
+        dst=_counts,
+        sycl_queue=exec_q,
+        depends=[set_ev, extr_ev],
+    )
+    _manager.add_event_pair(ht_ev, sub_ev)
+    return UniqueCountsResult(unique_vals, _counts)
+
+
+def unique_inverse(x):
+    """unique_inverse
+
+    Returns the unique elements of an input array x and the indices from the
+    set of unique elements that reconstruct `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array. Inputs with more than one dimension are flattened.
+    Returns:
+        tuple[usm_ndarray, usm_ndarray]
+            a namedtuple `(values, inverse_indices)` whose
+
+            * first element has the field name `values` and is an array
+              containing the unique elements of `x`. The array has the same
+              data type as `x`.
+            * second element has the field name `inverse_indices` and is an
+              array containing the indices of values that reconstruct `x`.
+              The array has the same shape as `x` and has the default array
+              index data type.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+    array_api_dev = x.device
+    exec_q = array_api_dev.sycl_queue
+    x_usm_type = x.usm_type
+    ind_dt = default_device_index_type(exec_q)
+    if x.ndim == 1:
+        fx = x
+    else:
+        fx = dpt_ext.reshape(x, (x.size,), order="C")
+    sorting_ids = dpt_ext.empty_like(fx, dtype=ind_dt, order="C")
+    unsorting_ids = dpt_ext.empty_like(sorting_ids, dtype=ind_dt, order="C")
+    if fx.size == 0:
+        return UniqueInverseResult(fx, dpt_ext.reshape(unsorting_ids, x.shape))
+
+    _manager = du.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    if fx.flags.c_contiguous:
+        ht_ev, sort_ev = _argsort_ascending(
+            src=fx,
+            trailing_dims_to_sort=1,
+            dst=sorting_ids,
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_ev, sort_ev)
+    else:
+        tmp = dpt_ext.empty_like(fx, order="C")
+        ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
+            src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_ev, copy_ev)
+        ht_ev, sort_ev = _argsort_ascending(
+            src=tmp,
+            trailing_dims_to_sort=1,
+            dst=sorting_ids,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_ev, sort_ev)
+    ht_ev, argsort_ev = _argsort_ascending(
+        src=sorting_ids,
+        trailing_dims_to_sort=1,
+        dst=unsorting_ids,
+        sycl_queue=exec_q,
+        depends=[sort_ev],
+    )
+    _manager.add_event_pair(ht_ev, argsort_ev)
+    s = dpt_ext.empty_like(fx)
+    # s = fx[sorting_ids]
+    ht_ev, take_ev = _take(
+        src=fx,
+        ind=(sorting_ids,),
+        dst=s,
+        axis_start=0,
+        mode=0,
+        sycl_queue=exec_q,
+        depends=[sort_ev],
+    )
+    _manager.add_event_pair(ht_ev, take_ev)
+    unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q)
+    ht_ev, uneq_ev = _not_equal(
+        src1=s[:-1],
+        src2=s[1:],
+        dst=unique_mask[1:],
+        sycl_queue=exec_q,
+        depends=[take_ev],
+    )
+    _manager.add_event_pair(ht_ev, uneq_ev)
+    # no dependency
+    ht_ev, one_ev = _full_usm_ndarray(
+        fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
+    )
+    _manager.add_event_pair(ht_ev, one_ev)
+    cumsum = dpt_ext.empty(
+        unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q
+    )
+    # synchronizing call
+    n_uniques = mask_positions(
+        unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev]
+    )
+    if n_uniques == fx.size:
+        return UniqueInverseResult(s, dpt_ext.reshape(unsorting_ids, x.shape))
+    unique_vals = dpt_ext.empty(
+        n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q
+    )
+    ht_ev, uv_ev = _extract(
+        src=s,
+        cumsum=cumsum,
+        axis_start=0,
+        axis_end=1,
+        dst=unique_vals,
+        sycl_queue=exec_q,
+    )
+    _manager.add_event_pair(ht_ev, uv_ev)
+    cum_unique_counts = dpt_ext.empty(
+        n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
+    )
+    idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
+    ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q)
+    _manager.add_event_pair(ht_ev, id_ev)
+    ht_ev, extr_ev = _extract(
+        src=idx,
+        cumsum=cumsum,
+        axis_start=0,
+        axis_end=1,
+        dst=cum_unique_counts[:-1],
+        sycl_queue=exec_q,
+        depends=[id_ev],
+    )
+    _manager.add_event_pair(ht_ev, extr_ev)
+    ht_ev, set_ev = _full_usm_ndarray(
+        x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q
+    )
+    _manager.add_event_pair(ht_ev, set_ev)
+    _counts = dpt_ext.empty_like(cum_unique_counts[1:])
+    ht_ev, sub_ev = _subtract(
+        src1=cum_unique_counts[1:],
+        src2=cum_unique_counts[:-1],
+        dst=_counts,
+        sycl_queue=exec_q,
+        depends=[set_ev, extr_ev],
+    )
+    _manager.add_event_pair(ht_ev, sub_ev)
+
+    inv = dpt_ext.empty_like(x, dtype=ind_dt, order="C")
+    ht_ev, ssl_ev = _searchsorted_left(
+        hay=unique_vals,
+        needles=x,
+        positions=inv,
+        sycl_queue=exec_q,
+        depends=[
+            uv_ev,
+        ],
+    )
+    _manager.add_event_pair(ht_ev, ssl_ev)
+
+    return UniqueInverseResult(unique_vals, inv)
+
+
+def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
+    """unique_all(x)
+
+    Returns the unique elements of an input array `x`, the first occurring
+    indices for each unique element in `x`, the indices from the set of unique
+    elements that reconstruct `x`, and the corresponding counts for each
+    unique element in `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array. Inputs with more than one dimension are flattened.
+    Returns:
+        tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray]
+            a namedtuple `(values, indices, inverse_indices, counts)` whose
+
+            * first element has the field name `values` and is an array
+              containing the unique elements of `x`. The array has the same
+              data type as `x`.
+            * second element has the field name `indices` and is an array
+              the indices (of first occurrences) of `x` that result in
+              `values`. The array has the same shape as `values` and has the
+              default array index data type.
+            * third element has the field name `inverse_indices` and is an
+              array containing the indices of values that reconstruct `x`.
+              The array has the same shape as `x` and has the default array
+              index data type.
+            * fourth element has the field name `counts` and is an array
+              containing the number of times each unique element occurs in `x`.
+              This array has the same shape as `values` and has the default
+              array index data type.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+    array_api_dev = x.device
+    exec_q = array_api_dev.sycl_queue
+    x_usm_type = x.usm_type
+    ind_dt = default_device_index_type(exec_q)
+    if x.ndim == 1:
+        fx = x
+    else:
+        fx = dpt_ext.reshape(x, (x.size,), order="C")
+    sorting_ids = dpt_ext.empty_like(fx, dtype=ind_dt, order="C")
+    unsorting_ids = dpt_ext.empty_like(sorting_ids, dtype=ind_dt, order="C")
+    if fx.size == 0:
+        # original array contains no data
+        # so it can be safely returned as values
+        return UniqueAllResult(
+            fx,
+            sorting_ids,
+            dpt_ext.reshape(unsorting_ids, x.shape),
+            dpt_ext.empty_like(fx, dtype=ind_dt),
+        )
+    _manager = du.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    if fx.flags.c_contiguous:
+        ht_ev, sort_ev = _argsort_ascending(
+            src=fx,
+            trailing_dims_to_sort=1,
+            dst=sorting_ids,
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_ev, sort_ev)
+    else:
+        tmp = dpt_ext.empty_like(fx, order="C")
+        ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
+            src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_ev, copy_ev)
+        ht_ev, sort_ev = _argsort_ascending(
+            src=tmp,
+            trailing_dims_to_sort=1,
+            dst=sorting_ids,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_ev, sort_ev)
+    ht_ev, args_ev = _argsort_ascending(
+        src=sorting_ids,
+        trailing_dims_to_sort=1,
+        dst=unsorting_ids,
+        sycl_queue=exec_q,
+        depends=[sort_ev],
+    )
+    _manager.add_event_pair(ht_ev, args_ev)
+    s = dpt_ext.empty_like(fx)
+    # s = fx[sorting_ids]
+    ht_ev, take_ev = _take(
+        src=fx,
+        ind=(sorting_ids,),
+        dst=s,
+        axis_start=0,
+        mode=0,
+        sycl_queue=exec_q,
+        depends=[sort_ev],
+    )
+    _manager.add_event_pair(ht_ev, take_ev)
+    unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q)
+    ht_ev, uneq_ev = _not_equal(
+        src1=s[:-1],
+        src2=s[1:],
+        dst=unique_mask[1:],
+        sycl_queue=exec_q,
+        depends=[take_ev],
+    )
+    _manager.add_event_pair(ht_ev, uneq_ev)
+    ht_ev, one_ev = _full_usm_ndarray(
+        fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
+    )
+    _manager.add_event_pair(ht_ev, one_ev)
+    cumsum = dpt_ext.empty(
+        unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q
+    )
+    # synchronizing call
+    n_uniques = mask_positions(
+        unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev]
+    )
+    if n_uniques == fx.size:
+        _counts = dpt_ext.ones(
+            n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
+        )
+        return UniqueAllResult(
+            s,
+            sorting_ids,
+            dpt_ext.reshape(unsorting_ids, x.shape),
+            _counts,
+        )
+    unique_vals = dpt_ext.empty(
+        n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q
+    )
+    ht_ev, uv_ev = _extract(
+        src=s,
+        cumsum=cumsum,
+        axis_start=0,
+        axis_end=1,
+        dst=unique_vals,
+        sycl_queue=exec_q,
+    )
+    _manager.add_event_pair(ht_ev, uv_ev)
+    cum_unique_counts = dpt_ext.empty(
+        n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
+    )
+    idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
+    ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q)
+    _manager.add_event_pair(ht_ev, id_ev)
+    ht_ev, extr_ev = _extract(
+        src=idx,
+        cumsum=cumsum,
+        axis_start=0,
+        axis_end=1,
+        dst=cum_unique_counts[:-1],
+        sycl_queue=exec_q,
+        depends=[id_ev],
+    )
+    _manager.add_event_pair(ht_ev, extr_ev)
+    ht_ev, set_ev = _full_usm_ndarray(
+        x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q
+    )
+    _manager.add_event_pair(ht_ev, set_ev)
+    _counts = dpt_ext.empty_like(cum_unique_counts[1:])
+    ht_ev, sub_ev = _subtract(
+        src1=cum_unique_counts[1:],
+        src2=cum_unique_counts[:-1],
+        dst=_counts,
+        sycl_queue=exec_q,
+        depends=[set_ev, extr_ev],
+    )
+    _manager.add_event_pair(ht_ev, sub_ev)
+
+    inv = dpt_ext.empty_like(x, dtype=ind_dt, order="C")
+    ht_ev, ssl_ev = _searchsorted_left(
+        hay=unique_vals,
+        needles=x,
+        positions=inv,
+        sycl_queue=exec_q,
+        depends=[
+            uv_ev,
+        ],
+    )
+    _manager.add_event_pair(ht_ev, ssl_ev)
+    return UniqueAllResult(
+        unique_vals,
+        sorting_ids[cum_unique_counts[:-1]],
+        inv,
+        _counts,
+    )
+
+
+def isin(
+    x: Union[dpt.usm_ndarray, int, float, complex, bool],
+    test_elements: Union[dpt.usm_ndarray, int, float, complex, bool],
+    /,
+    *,
+    invert: Optional[bool] = False,
+) -> dpt.usm_ndarray:
+    """isin(x, test_elements, /, *, invert=False)
+
+    Tests `x in test_elements` for each element of `x`. Returns a boolean array
+    with the same shape as `x` that is `True` where the element is in
+    `test_elements`, `False` otherwise.
+
+    Args:
+        x (Union[usm_ndarray, bool, int, float, complex]):
+            input element or elements.
+        test_elements (Union[usm_ndarray, bool, int, float, complex]):
+            elements against which to test each value of `x`.
+        invert (Optional[bool]):
+            if `True`, the output results are inverted, i.e., are equivalent to
+            testing `x not in test_elements` for each element of `x`.
+            Default: `False`.
+
+    Returns:
+        usm_ndarray:
+            an array of the inclusion test results. The returned array has a
+            boolean data type and the same shape as `x`.
+    """
+    q1, x_usm_type = _get_queue_usm_type(x)
+    q2, test_usm_type = _get_queue_usm_type(test_elements)
+    if q1 is None and q2 is None:
+        raise du.ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments. "
+            "One of the arguments must represent USM allocation and "
+            "expose `__sycl_usm_array_interface__` property"
+        )
+    if q1 is None:
+        exec_q = q2
+        res_usm_type = test_usm_type
+    elif q2 is None:
+        exec_q = q1
+        res_usm_type = x_usm_type
+    else:
+        exec_q = du.get_execution_queue((q1, q2))
+        if exec_q is None:
+            raise du.ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        res_usm_type = du.get_coerced_usm_type(
+            (
+                x_usm_type,
+                test_usm_type,
+            )
+        )
+    du.validate_usm_type(res_usm_type, allow_none=False)
+    sycl_dev = exec_q.sycl_device
+
+    if not isinstance(invert, bool):
+        raise TypeError(
+            "`invert` keyword argument must be of boolean type, "
+            f"got {type(invert)}"
+        )
+
+    x_dt = _get_dtype(x, sycl_dev)
+    test_dt = _get_dtype(test_elements, sycl_dev)
+    if not all(_validate_dtype(dt) for dt in (x_dt, test_dt)):
+        raise ValueError("Operands have unsupported data types")
+
+    x_sh = _get_shape(x)
+    if isinstance(test_elements, dpt.usm_ndarray) and test_elements.size == 0:
+        if invert:
+            return dpt_ext.ones(
+                x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q
+            )
+        else:
+            return dpt_ext.zeros(
+                x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q
+            )
+
+    dt1, dt2 = _resolve_weak_types_all_py_ints(x_dt, test_dt, sycl_dev)
+    dt = _to_device_supported_dtype(dpt_ext.result_type(dt1, dt2), sycl_dev)
+
+    if not isinstance(x, dpt.usm_ndarray):
+        x_arr = dpt_ext.asarray(
+            x, dtype=dt1, usm_type=res_usm_type, sycl_queue=exec_q
+        )
+    else:
+        x_arr = x
+
+    if not isinstance(test_elements, dpt.usm_ndarray):
+        test_arr = dpt_ext.asarray(
+            test_elements, dtype=dt2, usm_type=res_usm_type, sycl_queue=exec_q
+        )
+    else:
+        test_arr = test_elements
+
+    _manager = du.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+
+    if x_dt != dt:
+        x_buf = _empty_like_orderK(x_arr, dt, res_usm_type, exec_q)
+        ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray(
+            src=x_arr, dst=x_buf, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_ev, ev)
+    else:
+        x_buf = x_arr
+
+    if test_dt != dt:
+        # copy into C-contiguous memory, because the array will be flattened
+        test_buf = dpt_ext.empty_like(
+            test_arr, dtype=dt, order="C", usm_type=res_usm_type
+        )
+        ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray(
+            src=test_arr, dst=test_buf, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_ev, ev)
+    else:
+        test_buf = test_arr
+
+    test_buf = dpt_ext.reshape(test_buf, -1)
+    test_buf = dpt_ext.sort(test_buf)
+
+    dst = dpt_ext.empty_like(
+        x_buf, dtype=dpt.bool, usm_type=res_usm_type, order="C"
+    )
+
+    dep_evs = _manager.submitted_events
+    ht_ev, s_ev = _isin(
+        needles=x_buf,
+        hay=test_buf,
+        dst=dst,
+        sycl_queue=exec_q,
+        invert=invert,
+        depends=dep_evs,
+    )
+    _manager.add_event_pair(ht_ev, s_ev)
+    return dst
diff --git a/dpctl_ext/tensor/_sorting.py b/dpctl_ext/tensor/_sorting.py
new file mode 100644
index 000000000000..24693a408889
--- /dev/null
+++ b/dpctl_ext/tensor/_sorting.py
@@ -0,0 +1,450 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import operator
+from typing import NamedTuple
+
+import dpctl.tensor as dpt
+import dpctl.utils as du
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._tensor_impl as ti
+
+from ._numpy_helper import normalize_axis_index
+from ._tensor_sorting_impl import (
+    _argsort_ascending,
+    _argsort_descending,
+    _radix_argsort_ascending,
+    _radix_argsort_descending,
+    _radix_sort_ascending,
+    _radix_sort_descending,
+    _radix_sort_dtype_supported,
+    _sort_ascending,
+    _sort_descending,
+    _topk,
+)
+
+__all__ = ["sort", "argsort", "top_k"]
+
+
+def _get_mergesort_impl_fn(descending):
+    return _sort_descending if descending else _sort_ascending
+
+
+def _get_radixsort_impl_fn(descending):
+    return _radix_sort_descending if descending else _radix_sort_ascending
+
+
+def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
+    """sort(x, axis=-1, descending=False, stable=True)
+
+    Returns a sorted copy of an input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which to sort. If set to `-1`, the function
+            must sort along the last axis. Default: `-1`.
+        descending (Optional[bool]):
+            sort order. If `True`, the array must be sorted in descending
+            order (by value). If `False`, the array must be sorted in
+            ascending order (by value). Default: `False`.
+        stable (Optional[bool]):
+            sort stability. If `True`, the returned array must maintain the
+            relative order of `x` values which compare as equal. If `False`,
+            the returned array may or may not maintain the relative order of
+            `x` values which compare as equal. Default: `True`.
+        kind (Optional[Literal["stable", "mergesort", "radixsort"]]):
+            Sorting algorithm. The default is `"stable"`, which uses parallel
+            merge-sort or parallel radix-sort algorithms depending on the
+            array data type.
+    Returns:
+        usm_ndarray:
+            a sorted array. The returned array has the same data type and
+            the same shape as the input array `x`.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}"
+        )
+    nd = x.ndim
+    if nd == 0:
+        axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis")
+        return dpt_ext.copy(x, order="C")
+    else:
+        axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis")
+    a1 = axis + 1
+    if a1 == nd:
+        perm = list(range(nd))
+        arr = x
+    else:
+        perm = [i for i in range(nd) if i != axis] + [
+            axis,
+        ]
+        arr = dpt_ext.permute_dims(x, perm)
+    if kind is None:
+        kind = "stable"
+    if not isinstance(kind, str) or kind not in [
+        "stable",
+        "radixsort",
+        "mergesort",
+    ]:
+        raise ValueError(
+            "Unsupported kind value. Expected 'stable', 'mergesort', "
+            f"or 'radixsort', but got '{kind}'"
+        )
+    if kind == "mergesort":
+        impl_fn = _get_mergesort_impl_fn(descending)
+    elif kind == "radixsort":
+        if _radix_sort_dtype_supported(x.dtype.num):
+            impl_fn = _get_radixsort_impl_fn(descending)
+        else:
+            raise ValueError(f"Radix sort is not supported for {x.dtype}")
+    else:
+        dt = x.dtype
+        if dt in [dpt.bool, dpt.uint8, dpt.int8, dpt.int16, dpt.uint16]:
+            impl_fn = _get_radixsort_impl_fn(descending)
+        else:
+            impl_fn = _get_mergesort_impl_fn(descending)
+    exec_q = x.sycl_queue
+    _manager = du.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    if arr.flags.c_contiguous:
+        res = dpt_ext.empty_like(arr, order="C")
+        ht_ev, impl_ev = impl_fn(
+            src=arr,
+            trailing_dims_to_sort=1,
+            dst=res,
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_ev, impl_ev)
+    else:
+        tmp = dpt_ext.empty_like(arr, order="C")
+        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_ev, copy_ev)
+        res = dpt_ext.empty_like(arr, order="C")
+        ht_ev, impl_ev = impl_fn(
+            src=tmp,
+            trailing_dims_to_sort=1,
+            dst=res,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_ev, impl_ev)
+    if a1 != nd:
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        res = dpt_ext.permute_dims(res, inv_perm)
+    return res
+
+
+def _get_mergeargsort_impl_fn(descending):
+    return _argsort_descending if descending else _argsort_ascending
+
+
+def _get_radixargsort_impl_fn(descending):
+    return _radix_argsort_descending if descending else _radix_argsort_ascending
+
+
+def argsort(x, axis=-1, descending=False, stable=True, kind=None):
+    """argsort(x, axis=-1, descending=False, stable=True)
+
+    Returns the indices that sort an array `x` along a specified axis.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which to sort. If set to `-1`, the function
+            must sort along the last axis. Default: `-1`.
+        descending (Optional[bool]):
+            sort order. If `True`, the array must be sorted in descending
+            order (by value). If `False`, the array must be sorted in
+            ascending order (by value). Default: `False`.
+        stable (Optional[bool]):
+            sort stability. If `True`, the returned array must maintain the
+            relative order of `x` values which compare as equal. If `False`,
+            the returned array may or may not maintain the relative order of
+            `x` values which compare as equal. Default: `True`.
+        kind (Optional[Literal["stable", "mergesort", "radixsort"]]):
+            Sorting algorithm. The default is `"stable"`, which uses parallel
+            merge-sort or parallel radix-sort algorithms depending on the
+            array data type.
+
+    Returns:
+        usm_ndarray:
+            an array of indices. The returned array has the  same shape as
+            the input array `x`. The return array has default array index
+            data type.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}"
+        )
+    nd = x.ndim
+    if nd == 0:
+        axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis")
+        return dpt_ext.zeros_like(
+            x, dtype=ti.default_device_index_type(x.sycl_queue), order="C"
+        )
+    else:
+        axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis")
+    a1 = axis + 1
+    if a1 == nd:
+        perm = list(range(nd))
+        arr = x
+    else:
+        perm = [i for i in range(nd) if i != axis] + [
+            axis,
+        ]
+        arr = dpt_ext.permute_dims(x, perm)
+    if kind is None:
+        kind = "stable"
+    if not isinstance(kind, str) or kind not in [
+        "stable",
+        "radixsort",
+        "mergesort",
+    ]:
+        raise ValueError(
+            "Unsupported kind value. Expected 'stable', 'mergesort', "
+            f"or 'radixsort', but got '{kind}'"
+        )
+    if kind == "mergesort":
+        impl_fn = _get_mergeargsort_impl_fn(descending)
+    elif kind == "radixsort":
+        if _radix_sort_dtype_supported(x.dtype.num):
+            impl_fn = _get_radixargsort_impl_fn(descending)
+        else:
+            raise ValueError(f"Radix sort is not supported for {x.dtype}")
+    else:
+        dt = x.dtype
+        if dt in [dpt.bool, dpt.uint8, dpt.int8, dpt.int16, dpt.uint16]:
+            impl_fn = _get_radixargsort_impl_fn(descending)
+        else:
+            impl_fn = _get_mergeargsort_impl_fn(descending)
+    exec_q = x.sycl_queue
+    _manager = du.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    index_dt = ti.default_device_index_type(exec_q)
+    if arr.flags.c_contiguous:
+        res = dpt_ext.empty_like(arr, dtype=index_dt, order="C")
+        ht_ev, impl_ev = impl_fn(
+            src=arr,
+            trailing_dims_to_sort=1,
+            dst=res,
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_ev, impl_ev)
+    else:
+        tmp = dpt_ext.empty_like(arr, order="C")
+        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_ev, copy_ev)
+        res = dpt_ext.empty_like(arr, dtype=index_dt, order="C")
+        ht_ev, impl_ev = impl_fn(
+            src=tmp,
+            trailing_dims_to_sort=1,
+            dst=res,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_ev, impl_ev)
+    if a1 != nd:
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        res = dpt_ext.permute_dims(res, inv_perm)
+    return res
+
+
+def _get_top_k_largest(mode):
+    modes = {"largest": True, "smallest": False}
+    try:
+        return modes[mode]
+    except KeyError:
+        raise ValueError(
+            f"`mode` must be `largest` or `smallest`. Got `{mode}`."
+        )
+
+
+class TopKResult(NamedTuple):
+    values: dpt.usm_ndarray
+    indices: dpt.usm_ndarray
+
+
+def top_k(x, k, /, *, axis=None, mode="largest"):
+    """top_k(x, k, axis=None, mode="largest")
+
+    Returns the `k` largest or smallest values and their indices in the input
+    array `x` along the specified axis `axis`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        k (int):
+            number of elements to find. Must be a positive integer value.
+        axis (Optional[int]):
+            axis along which to search. If `None`, the search will be performed
+            over the flattened array. Default: ``None``.
+        mode (Literal["largest", "smallest"]):
+            search mode. Must be one of the following modes:
+
+            - `"largest"`: return the `k` largest elements.
+            - `"smallest"`: return the `k` smallest elements.
+
+            Default: `"largest"`.
+
+    Returns:
+        tuple[usm_ndarray, usm_ndarray]
+            a namedtuple `(values, indices)` whose
+
+            * first element `values` will be an array containing the `k`
+              largest or smallest elements of `x`. The array has the same data
+              type as `x`. If `axis` was `None`, `values` will be a
+              one-dimensional array with shape `(k,)` and otherwise, `values`
+              will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]`
+            * second element `indices` will be an array containing indices of
+              `x` that result in `values`. The array will have the same shape
+              as `values` and will have the default array index data type.
+    """
+    largest = _get_top_k_largest(mode)
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}"
+        )
+
+    k = operator.index(k)
+    if k < 0:
+        raise ValueError("`k` must be a positive integer value")
+
+    nd = x.ndim
+    if axis is None:
+        sz = x.size
+        if nd == 0:
+            if k > 1:
+                raise ValueError(f"`k`={k} is out of bounds 1")
+            return TopKResult(
+                dpt_ext.copy(x, order="C"),
+                dpt_ext.zeros_like(
+                    x, dtype=ti.default_device_index_type(x.sycl_queue)
+                ),
+            )
+        arr = x
+        n_search_dims = None
+        res_sh = k
+    else:
+        axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis")
+        sz = x.shape[axis]
+        a1 = axis + 1
+        if a1 == nd:
+            perm = list(range(nd))
+            arr = x
+        else:
+            perm = [i for i in range(nd) if i != axis] + [
+                axis,
+            ]
+            arr = dpt_ext.permute_dims(x, perm)
+        n_search_dims = 1
+        res_sh = arr.shape[: nd - 1] + (k,)
+
+    if k > sz:
+        raise ValueError(f"`k`={k} is out of bounds {sz}")
+
+    exec_q = x.sycl_queue
+    _manager = du.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+
+    res_usm_type = arr.usm_type
+    if arr.flags.c_contiguous:
+        vals = dpt_ext.empty(
+            res_sh,
+            dtype=arr.dtype,
+            usm_type=res_usm_type,
+            order="C",
+            sycl_queue=exec_q,
+        )
+        inds = dpt_ext.empty(
+            res_sh,
+            dtype=ti.default_device_index_type(exec_q),
+            usm_type=res_usm_type,
+            order="C",
+            sycl_queue=exec_q,
+        )
+        ht_ev, impl_ev = _topk(
+            src=arr,
+            trailing_dims_to_search=n_search_dims,
+            k=k,
+            largest=largest,
+            vals=vals,
+            inds=inds,
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_ev, impl_ev)
+    else:
+        tmp = dpt_ext.empty_like(arr, order="C")
+        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_ev, copy_ev)
+        vals = dpt_ext.empty(
+            res_sh,
+            dtype=arr.dtype,
+            usm_type=res_usm_type,
+            order="C",
+            sycl_queue=exec_q,
+        )
+        inds = dpt_ext.empty(
+            res_sh,
+            dtype=ti.default_device_index_type(exec_q),
+            usm_type=res_usm_type,
+            order="C",
+            sycl_queue=exec_q,
+        )
+        ht_ev, impl_ev = _topk(
+            src=tmp,
+            trailing_dims_to_search=n_search_dims,
+            k=k,
+            largest=largest,
+            vals=vals,
+            inds=inds,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_ev, impl_ev)
+    if axis is not None and a1 != nd:
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        vals = dpt_ext.permute_dims(vals, inv_perm)
+        inds = dpt_ext.permute_dims(inds, inv_perm)
+
+    return TopKResult(vals, inds)
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp
new file mode 100644
index 000000000000..847fa96ecdff
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp
@@ -0,0 +1,245 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor membership operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/sorting/search_sorted_detail.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/rich_comparisons.hpp"
+
+namespace dpctl::tensor::kernels
+{
+
+using dpctl::tensor::ssize_t;
+
+template <typename T,
+          typename HayIndexerT,
+          typename NeedlesIndexerT,
+          typename OutIndexerT>
+struct IsinFunctor
+{
+private:
+    bool invert;
+    const T *hay_tp;
+    const T *needles_tp;
+    bool *out_tp;
+    std::size_t hay_nelems;
+    HayIndexerT hay_indexer;
+    NeedlesIndexerT needles_indexer;
+    OutIndexerT out_indexer;
+
+public:
+    IsinFunctor(const bool invert_,
+                const T *hay_,
+                const T *needles_,
+                bool *out_,
+                const std::size_t hay_nelems_,
+                const HayIndexerT &hay_indexer_,
+                const NeedlesIndexerT &needles_indexer_,
+                const OutIndexerT &out_indexer_)
+        : invert(invert_), hay_tp(hay_), needles_tp(needles_), out_tp(out_),
+          hay_nelems(hay_nelems_), hay_indexer(hay_indexer_),
+          needles_indexer(needles_indexer_), out_indexer(out_indexer_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        using Compare =
+            typename dpctl::tensor::rich_comparisons::AscendingSorter<T>::type;
+        static constexpr Compare comp{};
+
+        const std::size_t i = id[0];
+        const T needle_v = needles_tp[needles_indexer(i)];
+
+        // position of the needle_v in the hay array
+        std::size_t pos{};
+
+        static constexpr std::size_t zero(0);
+        // search in hay in left-closed interval, give `pos` such that
+        // hay[pos - 1] < needle_v <= hay[pos]
+
+        // lower_bound returns the first pos such that bool(hay[pos] <
+        // needle_v) is false, i.e. needle_v <= hay[pos]
+        pos = search_sorted_detail::lower_bound_indexed_impl(
+            hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer);
+        bool out = (pos == hay_nelems ? false : hay_tp[pos] == needle_v);
+        out_tp[out_indexer(i)] = (invert) ? !out : out;
+    }
+};
+
+typedef sycl::event (*isin_contig_impl_fp_ptr_t)(
+    sycl::queue &,
+    const bool,
+    const std::size_t,
+    const std::size_t,
+    const char *,
+    const ssize_t,
+    const char *,
+    const ssize_t,
+    char *,
+    const ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+class isin_contig_impl_krn;
+
+template <typename T>
+sycl::event isin_contig_impl(sycl::queue &exec_q,
+                             const bool invert,
+                             const std::size_t hay_nelems,
+                             const std::size_t needles_nelems,
+                             const char *hay_cp,
+                             const ssize_t hay_offset,
+                             const char *needles_cp,
+                             const ssize_t needles_offset,
+                             char *out_cp,
+                             const ssize_t out_offset,
+                             const std::vector<sycl::event> &depends)
+{
+    const T *hay_tp = reinterpret_cast<const T *>(hay_cp) + hay_offset;
+    const T *needles_tp =
+        reinterpret_cast<const T *>(needles_cp) + needles_offset;
+
+    bool *out_tp = reinterpret_cast<bool *>(out_cp) + out_offset;
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using KernelName = class isin_contig_impl_krn<T>;
+
+        sycl::range<1> gRange(needles_nelems);
+
+        using TrivialIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        static constexpr TrivialIndexerT hay_indexer{};
+        static constexpr TrivialIndexerT needles_indexer{};
+        static constexpr TrivialIndexerT out_indexer{};
+
+        const auto fnctr =
+            IsinFunctor<T, TrivialIndexerT, TrivialIndexerT, TrivialIndexerT>(
+                invert, hay_tp, needles_tp, out_tp, hay_nelems, hay_indexer,
+                needles_indexer, out_indexer);
+
+        cgh.parallel_for<KernelName>(gRange, fnctr);
+    });
+
+    return comp_ev;
+}
+
+typedef sycl::event (*isin_strided_impl_fp_ptr_t)(
+    sycl::queue &,
+    const bool,
+    const std::size_t,
+    const std::size_t,
+    const char *,
+    const ssize_t,
+    const ssize_t,
+    const char *,
+    const ssize_t,
+    char *,
+    const ssize_t,
+    int,
+    const ssize_t *,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+class isin_strided_impl_krn;
+
+template <typename T>
+sycl::event isin_strided_impl(
+    sycl::queue &exec_q,
+    const bool invert,
+    const std::size_t hay_nelems,
+    const std::size_t needles_nelems,
+    const char *hay_cp,
+    const ssize_t hay_offset,
+    // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array
+    const ssize_t hay_stride,
+    const char *needles_cp,
+    const ssize_t needles_offset,
+    char *out_cp,
+    const ssize_t out_offset,
+    const int needles_nd,
+    // packed_shape_strides is [needles_shape, needles_strides,
+    // out_strides] has length of 3*needles_nd
+    const ssize_t *packed_shape_strides,
+    const std::vector<sycl::event> &depends)
+{
+    const T *hay_tp = reinterpret_cast<const T *>(hay_cp);
+    const T *needles_tp = reinterpret_cast<const T *>(needles_cp);
+
+    bool *out_tp = reinterpret_cast<bool *>(out_cp);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        sycl::range<1> gRange(needles_nelems);
+
+        using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        const HayIndexerT hay_indexer(
+            /* offset */ hay_offset,
+            /* size   */ hay_nelems,
+            /* step   */ hay_stride);
+
+        using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+        const ssize_t *needles_shape_strides = packed_shape_strides;
+        const NeedlesIndexerT needles_indexer(needles_nd, needles_offset,
+                                              needles_shape_strides);
+        using OutIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+        const ssize_t *out_shape = packed_shape_strides;
+        const ssize_t *out_strides = packed_shape_strides + 2 * needles_nd;
+        const OutIndexerT out_indexer(needles_nd, out_offset, out_shape,
+                                      out_strides);
+
+        const auto fnctr =
+            IsinFunctor<T, HayIndexerT, NeedlesIndexerT, OutIndexerT>(
+                invert, hay_tp, needles_tp, out_tp, hay_nelems, hay_indexer,
+                needles_indexer, out_indexer);
+        using KernelName = class isin_strided_impl_krn<T>;
+
+        cgh.parallel_for<KernelName>(gRange, fnctr);
+    });
+
+    return comp_ev;
+}
+
+} // namespace dpctl::tensor::kernels
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
new file mode 100644
index 000000000000..a047c172f7bc
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
@@ -0,0 +1,856 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor sort/argsort operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/sorting/search_sorted_detail.hpp"
+#include "kernels/sorting/sort_utils.hpp"
+
+namespace dpctl::tensor::kernels
+{
+
+namespace merge_sort_detail
+{
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::kernels::search_sorted_detail;
+
+/*! @brief Merge two contiguous sorted segments */
+template <typename InAcc, typename OutAcc, typename Compare>
+void merge_impl(const std::size_t offset,
+                const InAcc in_acc,
+                OutAcc out_acc,
+                const std::size_t start_1,
+                const std::size_t end_1,
+                const std::size_t end_2,
+                const std::size_t start_out,
+                Compare comp,
+                const std::size_t chunk)
+{
+    const std::size_t start_2 = end_1;
+    // Borders of the sequences to merge within this call
+    const std::size_t local_start_1 = sycl::min(offset + start_1, end_1);
+    const std::size_t local_end_1 = sycl::min(local_start_1 + chunk, end_1);
+    const std::size_t local_start_2 = sycl::min(offset + start_2, end_2);
+    const std::size_t local_end_2 = sycl::min(local_start_2 + chunk, end_2);
+
+    const std::size_t local_size_1 = local_end_1 - local_start_1;
+    const std::size_t local_size_2 = local_end_2 - local_start_2;
+
+    const auto r_item_1 = in_acc[end_1 - 1];
+    const auto l_item_2 = (start_2 < end_2) ? in_acc[start_2] : r_item_1;
+
+    // Copy if the sequences are sorted with respect to each other or merge
+    // otherwise
+    if (!comp(l_item_2, r_item_1)) {
+        const std::size_t out_shift_1 = start_out + local_start_1 - start_1;
+        const std::size_t out_shift_2 =
+            start_out + end_1 - start_1 + local_start_2 - start_2;
+
+        for (std::size_t i = 0; i < local_size_1; ++i) {
+            out_acc[out_shift_1 + i] = in_acc[local_start_1 + i];
+        }
+        for (std::size_t i = 0; i < local_size_2; ++i) {
+            out_acc[out_shift_2 + i] = in_acc[local_start_2 + i];
+        }
+    }
+    else if (comp(r_item_1, l_item_2)) {
+        const std::size_t out_shift_1 =
+            start_out + end_2 - start_2 + local_start_1 - start_1;
+        const std::size_t out_shift_2 = start_out + local_start_2 - start_2;
+        for (std::size_t i = 0; i < local_size_1; ++i) {
+            out_acc[out_shift_1 + i] = in_acc[local_start_1 + i];
+        }
+        for (std::size_t i = 0; i < local_size_2; ++i) {
+            out_acc[out_shift_2 + i] = in_acc[local_start_2 + i];
+        }
+    }
+    // Perform merging
+    else {
+
+        // Process 1st sequence
+        if (local_start_1 < local_end_1) {
+            // Reduce the range for searching within the 2nd sequence and handle
+            // bound items find left border in 2nd sequence
+            const auto local_l_item_1 = in_acc[local_start_1];
+            std::size_t l_search_bound_2 =
+                lower_bound_impl(in_acc, start_2, end_2, local_l_item_1, comp);
+            const std::size_t l_shift_1 = local_start_1 - start_1;
+            const std::size_t l_shift_2 = l_search_bound_2 - start_2;
+
+            out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_1;
+
+            std::size_t r_search_bound_2{};
+            // find right border in 2nd sequence
+            if (local_size_1 > 1) {
+                const auto local_r_item_1 = in_acc[local_end_1 - 1];
+                r_search_bound_2 = lower_bound_impl(
+                    in_acc, l_search_bound_2, end_2, local_r_item_1, comp);
+                const auto r_shift_1 = local_end_1 - 1 - start_1;
+                const auto r_shift_2 = r_search_bound_2 - start_2;
+
+                out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_1;
+            }
+
+            // Handle intermediate items
+            if (r_search_bound_2 == l_search_bound_2) {
+                const std::size_t shift_2 = l_search_bound_2 - start_2;
+                for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1;
+                     ++idx) {
+                    const auto intermediate_item_1 = in_acc[idx];
+                    const std::size_t shift_1 = idx - start_1;
+                    out_acc[start_out + shift_1 + shift_2] =
+                        intermediate_item_1;
+                }
+            }
+            else {
+                for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1;
+                     ++idx) {
+                    const auto intermediate_item_1 = in_acc[idx];
+                    // we shouldn't seek in whole 2nd sequence. Just for the
+                    // part where the 1st sequence should be
+                    l_search_bound_2 = lower_bound_impl(
+                        in_acc, l_search_bound_2, r_search_bound_2,
+                        intermediate_item_1, comp);
+                    const std::size_t shift_1 = idx - start_1;
+                    const std::size_t shift_2 = l_search_bound_2 - start_2;
+
+                    out_acc[start_out + shift_1 + shift_2] =
+                        intermediate_item_1;
+                }
+            }
+        }
+        // Process 2nd sequence
+        if (local_start_2 < local_end_2) {
+            // Reduce the range for searching within the 1st sequence and handle
+            // bound items find left border in 1st sequence
+            const auto local_l_item_2 = in_acc[local_start_2];
+            std::size_t l_search_bound_1 =
+                upper_bound_impl(in_acc, start_1, end_1, local_l_item_2, comp);
+            const std::size_t l_shift_1 = l_search_bound_1 - start_1;
+            const std::size_t l_shift_2 = local_start_2 - start_2;
+
+            out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_2;
+
+            std::size_t r_search_bound_1{};
+            // find right border in 1st sequence
+            if (local_size_2 > 1) {
+                const auto local_r_item_2 = in_acc[local_end_2 - 1];
+                r_search_bound_1 = upper_bound_impl(
+                    in_acc, l_search_bound_1, end_1, local_r_item_2, comp);
+                const std::size_t r_shift_1 = r_search_bound_1 - start_1;
+                const std::size_t r_shift_2 = local_end_2 - 1 - start_2;
+
+                out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_2;
+            }
+
+            // Handle intermediate items
+            if (l_search_bound_1 == r_search_bound_1) {
+                const std::size_t shift_1 = l_search_bound_1 - start_1;
+                for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx)
+                {
+                    const auto intermediate_item_2 = in_acc[idx];
+                    const std::size_t shift_2 = idx - start_2;
+                    out_acc[start_out + shift_1 + shift_2] =
+                        intermediate_item_2;
+                }
+            }
+            else {
+                for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx)
+                {
+                    const auto intermediate_item_2 = in_acc[idx];
+                    // we shouldn't seek in whole 1st sequence. Just for the
+                    // part where the 2nd sequence should be
+                    l_search_bound_1 = upper_bound_impl(
+                        in_acc, l_search_bound_1, r_search_bound_1,
+                        intermediate_item_2, comp);
+                    const std::size_t shift_1 = l_search_bound_1 - start_1;
+                    const std::size_t shift_2 = idx - start_2;
+
+                    out_acc[start_out + shift_1 + shift_2] =
+                        intermediate_item_2;
+                }
+            }
+        }
+    }
+}
+
+template <typename Iter, typename Compare>
+void insertion_sort_impl(Iter &&first,
+                         std::size_t begin,
+                         std::size_t end,
+                         Compare &&comp)
+{
+    for (std::size_t i = begin + 1; i < end; ++i) {
+        const auto val_i = first[i];
+        std::size_t j = i - 1;
+        while ((j + 1 > begin) && (comp(val_i, first[j]))) {
+            first[j + 1] = first[j];
+            --j;
+        }
+        if (j + 1 < i) {
+            first[j + 1] = val_i;
+        }
+    }
+}
+
+template <typename Iter, typename Compare>
+void leaf_sort_impl(Iter &&first,
+                    std::size_t begin,
+                    std::size_t end,
+                    Compare &&comp)
+{
+    return insertion_sort_impl<Iter, Compare>(std::forward<Iter>(first),
+                                              std::move(begin), std::move(end),
+                                              std::forward<Compare>(comp));
+}
+
+template <typename Iter>
+struct GetValueType
+{
+    using value_type = typename std::iterator_traits<Iter>::value_type;
+};
+
+template <typename ElementType,
+          sycl::access::address_space Space,
+          sycl::access::decorated IsDecorated>
+struct GetValueType<sycl::multi_ptr<ElementType, Space, IsDecorated>>
+{
+    using value_type = ElementType;
+};
+
+template <typename ElementType,
+          int Dim,
+          sycl::access_mode Mode,
+          sycl::target Target,
+          sycl::access::placeholder isPlaceholder>
+struct GetValueType<
+    sycl::accessor<ElementType, Dim, Mode, Target, isPlaceholder>>
+{
+    using value_type = ElementType;
+};
+
+template <typename ElementType, int Dim, typename AllocatorT>
+struct GetValueType<sycl::buffer<ElementType, Dim, AllocatorT>>
+{
+    using value_type = ElementType;
+};
+
+template <typename Iter>
+struct GetReadOnlyAccess
+{
+    Iter operator()(const Iter &it, sycl::handler &)
+    {
+        return it;
+    }
+};
+
+template <typename ElementType, int Dim, typename AllocatorT>
+struct GetReadOnlyAccess<sycl::buffer<ElementType, Dim, AllocatorT>>
+{
+    auto operator()(const sycl::buffer<ElementType, Dim, AllocatorT> &buf,
+                    sycl::handler &cgh)
+    {
+        sycl::accessor acc(buf, cgh, sycl::read_only);
+        return acc;
+    }
+};
+
+template <typename Iter>
+struct GetWriteDiscardAccess
+{
+    Iter operator()(Iter it, sycl::handler &)
+    {
+        return it;
+    }
+};
+
+template <typename ElementType, int Dim, typename AllocatorT>
+struct GetWriteDiscardAccess<sycl::buffer<ElementType, Dim, AllocatorT>>
+{
+    auto operator()(sycl::buffer<ElementType, Dim, AllocatorT> &buf,
+                    sycl::handler &cgh)
+    {
+        sycl::accessor acc(buf, cgh, sycl::write_only, sycl::no_init);
+        return acc;
+    }
+};
+
+template <typename Iter>
+struct GetReadWriteAccess
+{
+    Iter operator()(Iter &it, sycl::handler &)
+    {
+        return it;
+    }
+};
+
+template <typename ElementType, int Dim, typename AllocatorT>
+struct GetReadWriteAccess<sycl::buffer<ElementType, Dim, AllocatorT>>
+{
+    auto operator()(sycl::buffer<ElementType, Dim, AllocatorT> &buf,
+                    sycl::handler &cgh)
+    {
+        sycl::accessor acc(buf, cgh, sycl::read_write);
+        return acc;
+    }
+};
+
+template <typename T1, typename T2, typename Comp>
+class sort_base_step_contig_krn;
+
+template <typename InpAcc, typename OutAcc, typename Comp>
+sycl::event
+    sort_base_step_contig_impl(sycl::queue &q,
+                               const std::size_t iter_nelems,
+                               const std::size_t sort_nelems,
+                               const InpAcc input,
+                               OutAcc output,
+                               const Comp &comp,
+                               const std::size_t conseq_nelems_sorted,
+                               const std::vector<sycl::event> &depends = {})
+{
+
+    using inpT = typename GetValueType<InpAcc>::value_type;
+    using outT = typename GetValueType<OutAcc>::value_type;
+    using KernelName = sort_base_step_contig_krn<inpT, outT, Comp>;
+
+    const std::size_t n_segments =
+        quotient_ceil(sort_nelems, conseq_nelems_sorted);
+
+    sycl::event base_sort = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const sycl::range<1> gRange{iter_nelems * n_segments};
+
+        auto input_acc = GetReadOnlyAccess<InpAcc>{}(input, cgh);
+        auto output_acc = GetWriteDiscardAccess<OutAcc>{}(output, cgh);
+
+        cgh.parallel_for<KernelName>(gRange, [=](sycl::id<1> id) {
+            const std::size_t iter_id = id[0] / n_segments;
+            const std::size_t segment_id = id[0] - iter_id * n_segments;
+
+            const std::size_t iter_offset = iter_id * sort_nelems;
+            const std::size_t beg_id =
+                iter_offset + segment_id * conseq_nelems_sorted;
+            const std::size_t end_id =
+                iter_offset +
+                std::min((segment_id + 1) * conseq_nelems_sorted, sort_nelems);
+            for (std::size_t i = beg_id; i < end_id; ++i) {
+                output_acc[i] = input_acc[i];
+            }
+
+            leaf_sort_impl(output_acc, beg_id, end_id, comp);
+        });
+    });
+
+    return base_sort;
+}
+
+template <typename T1, typename T2, typename Comp>
+class sort_over_work_group_contig_krn;
+
+template <typename InpAcc, typename OutAcc, typename Comp>
+sycl::event sort_over_work_group_contig_impl(
+    sycl::queue &q,
+    std::size_t iter_nelems,
+    std::size_t sort_nelems,
+    const InpAcc input,
+    OutAcc output,
+    const Comp &comp,
+    std::size_t &nelems_wg_sorts,
+    const std::vector<sycl::event> &depends = {})
+{
+    using inpT = typename GetValueType<InpAcc>::value_type;
+    using T = typename GetValueType<OutAcc>::value_type;
+    using KernelName = sort_over_work_group_contig_krn<inpT, T, Comp>;
+
+    const auto &kernel_id = sycl::get_kernel_id<KernelName>();
+
+    auto const &ctx = q.get_context();
+    auto const &dev = q.get_device();
+    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+        ctx, {dev}, {kernel_id});
+
+    auto krn = kb.get_kernel(kernel_id);
+
+    const std::uint32_t max_sg_size = krn.template get_info<
+        sycl::info::kernel_device_specific::max_sub_group_size>(dev);
+    const std::uint64_t device_local_memory_size =
+        dev.get_info<sycl::info::device::local_mem_size>();
+
+    //  leave 512 bytes of local memory for RT
+    const std::uint64_t safety_margin = 512;
+
+    const std::uint64_t nelems_per_slm =
+        (device_local_memory_size - safety_margin) / (2 * sizeof(T));
+
+    static constexpr std::uint32_t sub_groups_per_work_group = 4;
+    const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2;
+
+    const std::size_t lws = sub_groups_per_work_group * max_sg_size;
+
+    nelems_wg_sorts = elems_per_wi * lws;
+
+    if (nelems_wg_sorts > nelems_per_slm) {
+        nelems_wg_sorts = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4);
+
+        return sort_base_step_contig_impl<InpAcc, OutAcc, Comp>(
+            q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts,
+            depends);
+    }
+
+    // This assumption permits doing away with using a loop
+    assert(nelems_wg_sorts % lws == 0);
+
+    const std::size_t n_segments = quotient_ceil(sort_nelems, nelems_wg_sorts);
+
+    sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        cgh.use_kernel_bundle(kb);
+
+        sycl::range<1> global_range{iter_nelems * n_segments * lws};
+        sycl::range<1> local_range{lws};
+
+        sycl::range<1> slm_range{nelems_wg_sorts};
+        sycl::local_accessor<T, 1> work_space(slm_range, cgh);
+        sycl::local_accessor<T, 1> scratch_space(slm_range, cgh);
+
+        auto input_acc = GetReadOnlyAccess<InpAcc>{}(input, cgh);
+        auto output_acc = GetWriteDiscardAccess<OutAcc>{}(output, cgh);
+
+        sycl::nd_range<1> ndRange(global_range, local_range);
+
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> it) {
+            const std::size_t group_id = it.get_group_linear_id();
+            const std::size_t iter_id = group_id / n_segments;
+            const std::size_t segment_id = group_id - iter_id * n_segments;
+            const std::size_t lid = it.get_local_linear_id();
+
+            const std::size_t segment_start_idx = segment_id * nelems_wg_sorts;
+            const std::size_t segment_end_idx =
+                std::min(segment_start_idx + nelems_wg_sorts, sort_nelems);
+            const std::size_t wg_chunk_size =
+                segment_end_idx - segment_start_idx;
+
+            // load input into SLM
+            for (std::size_t array_id = segment_start_idx + lid;
+                 array_id < segment_end_idx; array_id += lws)
+            {
+                T v = (array_id < sort_nelems)
+                          ? input_acc[iter_id * sort_nelems + array_id]
+                          : T{};
+                work_space[array_id - segment_start_idx] = v;
+            }
+            sycl::group_barrier(it.get_group());
+
+            const std::size_t chunk = quotient_ceil(nelems_wg_sorts, lws);
+
+            const std::size_t chunk_start_idx = lid * chunk;
+            const std::size_t chunk_end_idx =
+                sycl::min(chunk_start_idx + chunk, wg_chunk_size);
+
+            leaf_sort_impl(work_space, chunk_start_idx, chunk_end_idx, comp);
+
+            sycl::group_barrier(it.get_group());
+
+            bool data_in_temp = false;
+            std::size_t n_chunks_merged = 1;
+
+            // merge chunk while n_chunks_merged * chunk < wg_chunk_size
+            const std::size_t max_chunks_merged =
+                1 + ((wg_chunk_size - 1) / chunk);
+            for (; n_chunks_merged < max_chunks_merged;
+                 data_in_temp = !data_in_temp, n_chunks_merged *= 2)
+            {
+                const std::size_t nelems_sorted_so_far =
+                    n_chunks_merged * chunk;
+                const std::size_t q = (lid / n_chunks_merged);
+                const std::size_t start_1 =
+                    sycl::min(2 * nelems_sorted_so_far * q, wg_chunk_size);
+                const std::size_t end_1 =
+                    sycl::min(start_1 + nelems_sorted_so_far, wg_chunk_size);
+                const std::size_t end_2 =
+                    sycl::min(end_1 + nelems_sorted_so_far, wg_chunk_size);
+                const std::size_t offset = chunk * (lid - q * n_chunks_merged);
+
+                if (data_in_temp) {
+                    merge_impl(offset, scratch_space, work_space, start_1,
+                               end_1, end_2, start_1, comp, chunk);
+                }
+                else {
+                    merge_impl(offset, work_space, scratch_space, start_1,
+                               end_1, end_2, start_1, comp, chunk);
+                }
+                sycl::group_barrier(it.get_group());
+            }
+
+            const auto &out_src = (data_in_temp) ? scratch_space : work_space;
+            for (std::size_t array_id = segment_start_idx + lid;
+                 array_id < segment_end_idx; array_id += lws)
+            {
+                if (array_id < sort_nelems) {
+                    output_acc[iter_id * sort_nelems + array_id] =
+                        out_src[array_id - segment_start_idx];
+                }
+            }
+        });
+    });
+
+    return base_sort_ev;
+}
+
+class vacuous_krn;
+
+inline sycl::event tie_events(sycl::queue &q,
+                              const std::vector<sycl::event> depends)
+{
+    if (depends.empty())
+        return sycl::event();
+    if (depends.size() == 1)
+        return depends[0];
+
+    sycl::event e = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        using KernelName = vacuous_krn;
+        cgh.single_task<KernelName>([]() {});
+    });
+
+    return e;
+}
+
+template <typename T, typename Comp>
+class merge_adjacent_blocks_to_temp_krn;
+
+template <typename T, typename Comp>
+class merge_adjacent_blocks_from_temp_krn;
+
+template <typename Acc, typename Comp>
+sycl::event
+    merge_sorted_block_contig_impl(sycl::queue &q,
+                                   std::size_t iter_nelems,
+                                   std::size_t sort_nelems,
+                                   Acc output,
+                                   const Comp comp,
+                                   std::size_t sorted_block_size,
+                                   const std::vector<sycl::event> &depends = {})
+{
+
+    if (sorted_block_size >= sort_nelems)
+        return tie_events(q, depends);
+
+    // experimentally determined value
+    // size of segments worked upon by each work-item during merging
+    const sycl::device &dev = q.get_device();
+    const std::size_t segment_size = (dev.has(sycl::aspect::cpu)) ? 32 : 4;
+
+    const std::size_t chunk_size =
+        (sorted_block_size < segment_size) ? sorted_block_size : segment_size;
+
+    assert(sorted_block_size % chunk_size == 0);
+
+    using T = typename GetValueType<Acc>::value_type;
+
+    sycl::buffer<T, 1> temp_buf(sycl::range<1>{iter_nelems * sort_nelems});
+    // T *allocated_mem = sycl::malloc_device<T>(iter_nelems * sort_nelems, q);
+
+    bool needs_copy = true;
+    bool used_depends = false;
+
+    sycl::event dep_ev;
+    std::size_t chunks_merged = sorted_block_size / chunk_size;
+
+    assert(!(chunks_merged & (chunks_merged - 1)));
+
+    using ToTempKernelName = class merge_adjacent_blocks_to_temp_krn<T, Comp>;
+    using FromTempKernelName =
+        class merge_adjacent_blocks_from_temp_krn<T, Comp>;
+
+    while (chunks_merged * chunk_size < sort_nelems) {
+        sycl::event local_dep = dep_ev;
+
+        sycl::event merge_ev = q.submit([&](sycl::handler &cgh) {
+            if (used_depends) {
+                cgh.depends_on(local_dep);
+            }
+            else {
+                cgh.depends_on(depends);
+                used_depends = true;
+            }
+
+            const std::size_t n_chunks = quotient_ceil(sort_nelems, chunk_size);
+
+            if (needs_copy) {
+                sycl::accessor temp_acc{temp_buf, cgh, sycl::write_only,
+                                        sycl::no_init};
+                auto output_acc = GetReadOnlyAccess<Acc>{}(output, cgh);
+                cgh.parallel_for<ToTempKernelName>(
+                    {iter_nelems * n_chunks}, [=](sycl::id<1> wid) {
+                        auto flat_idx = wid[0];
+                        auto iter_idx = flat_idx / n_chunks;
+                        auto idx = flat_idx - n_chunks * iter_idx;
+
+                        const std::size_t idx_mult =
+                            (idx / chunks_merged) * chunks_merged;
+                        const std::size_t idx_rem = (idx - idx_mult);
+                        const std::size_t start_1 =
+                            sycl::min(2 * idx_mult * chunk_size, sort_nelems);
+                        const std::size_t end_1 = sycl::min(
+                            start_1 + chunks_merged * chunk_size, sort_nelems);
+                        const std::size_t end_2 = sycl::min(
+                            end_1 + chunks_merged * chunk_size, sort_nelems);
+                        const std::size_t offset = chunk_size * idx_rem;
+
+                        const std::size_t iter_offset = iter_idx * sort_nelems;
+
+                        merge_impl(offset, output_acc, temp_acc,
+                                   iter_offset + start_1, iter_offset + end_1,
+                                   iter_offset + end_2, iter_offset + start_1,
+                                   comp, chunk_size);
+                    });
+            }
+            else {
+                sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only};
+                auto output_acc = GetWriteDiscardAccess<Acc>{}(output, cgh);
+                cgh.parallel_for<FromTempKernelName>(
+                    {iter_nelems * n_chunks}, [=](sycl::id<1> wid) {
+                        auto flat_idx = wid[0];
+                        auto iter_idx = flat_idx / n_chunks;
+                        auto idx = flat_idx - n_chunks * iter_idx;
+
+                        const std::size_t idx_mult =
+                            (idx / chunks_merged) * chunks_merged;
+                        const std::size_t idx_rem = (idx - idx_mult);
+                        const std::size_t start_1 =
+                            sycl::min(2 * idx_mult * chunk_size, sort_nelems);
+                        const std::size_t end_1 = sycl::min(
+                            start_1 + chunks_merged * chunk_size, sort_nelems);
+                        const std::size_t end_2 = sycl::min(
+                            end_1 + chunks_merged * chunk_size, sort_nelems);
+                        const std::size_t offset = chunk_size * idx_rem;
+
+                        const std::size_t iter_offset = iter_idx * sort_nelems;
+
+                        merge_impl(offset, temp_acc, output_acc,
+                                   iter_offset + start_1, iter_offset + end_1,
+                                   iter_offset + end_2, iter_offset + start_1,
+                                   comp, chunk_size);
+                    });
+            }
+        });
+
+        chunks_merged *= 2;
+        dep_ev = merge_ev;
+
+        if (chunks_merged * chunk_size < sort_nelems) {
+            needs_copy = !needs_copy;
+        }
+    }
+
+    if (needs_copy) {
+        sycl::event copy_ev = q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(dep_ev);
+
+            sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only};
+            auto output_acc = GetWriteDiscardAccess<Acc>{}(output, cgh);
+
+            cgh.copy(temp_acc, output_acc);
+        });
+        dep_ev = copy_ev;
+    }
+
+    return dep_ev;
+}
+
+} // namespace merge_sort_detail
+
+template <typename argTy, typename Comp = std::less<argTy>>
+sycl::event stable_sort_axis1_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a
+                             // matrix when sorting over rows)
+    std::size_t sort_nelems, // size of each array to sort  (length of rows,
+                             // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t sort_arg_offset,
+    ssize_t sort_res_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + sort_arg_offset;
+    argTy *res_tp =
+        reinterpret_cast<argTy *>(res_cp) + iter_res_offset + sort_res_offset;
+
+    auto comp = Comp{};
+
+    // constant chosen experimentally to ensure monotonicity of
+    // sorting performance, as measured on GPU Max, and Iris Xe
+    constexpr std::size_t sequential_sorting_threshold = 16;
+
+    if (sort_nelems < sequential_sorting_threshold) {
+        // equal work-item sorts entire row
+        sycl::event sequential_sorting_ev =
+            merge_sort_detail::sort_base_step_contig_impl<const argTy *,
+                                                          argTy *, Comp>(
+                exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp,
+                sort_nelems, depends);
+
+        return sequential_sorting_ev;
+    }
+    else {
+        std::size_t sorted_block_size{};
+
+        // Sort segments of the array
+        sycl::event base_sort_ev =
+            merge_sort_detail::sort_over_work_group_contig_impl<const argTy *,
+                                                                argTy *, Comp>(
+                exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp,
+                sorted_block_size, // modified in place with size of sorted
+                                   // block size
+                depends);
+
+        // Merge segments in parallel until all elements are sorted
+        sycl::event merges_ev =
+            merge_sort_detail::merge_sorted_block_contig_impl<argTy *, Comp>(
+                exec_q, iter_nelems, sort_nelems, res_tp, comp,
+                sorted_block_size, {base_sort_ev});
+
+        return merges_ev;
+    }
+}
+
+template <typename T1, typename T2>
+class populate_index_data_krn;
+
+template <typename T1, typename T2>
+class index_map_to_rows_krn;
+
+template <typename IndexT, typename ValueT, typename ValueComp>
+struct IndexComp
+{
+    IndexComp(const ValueT *data, const ValueComp &comp_op)
+        : ptr(data), value_comp(comp_op)
+    {
+    }
+
+    bool operator()(const IndexT &i1, const IndexT &i2) const
+    {
+        return value_comp(ptr[i1], ptr[i2]);
+    }
+
+private:
+    const ValueT *ptr;
+    ValueComp value_comp;
+};
+
+template <typename argTy,
+          typename IndexTy,
+          typename ValueComp = std::less<argTy>>
+sycl::event stable_argsort_axis1_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a
+                             // matrix when sorting over rows)
+    std::size_t sort_nelems, // size of each array to sort  (length of rows,
+                             // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t sort_arg_offset,
+    ssize_t sort_res_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + sort_arg_offset;
+    IndexTy *res_tp =
+        reinterpret_cast<IndexTy *>(res_cp) + iter_res_offset + sort_res_offset;
+
+    const IndexComp<IndexTy, argTy, ValueComp> index_comp{arg_tp, ValueComp{}};
+
+    static constexpr std::size_t determine_automatically = 0;
+    std::size_t sorted_block_size = determine_automatically;
+
+    const std::size_t total_nelems = iter_nelems * sort_nelems;
+
+    using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
+
+    using IotaKernelName = populate_index_data_krn<argTy, IndexTy>;
+
+    sycl::event populate_indexed_data_ev = iota_impl<IotaKernelName, IndexTy>(
+        exec_q, res_tp, total_nelems, depends);
+
+    // Sort segments of the array
+    sycl::event base_sort_ev =
+        merge_sort_detail::sort_over_work_group_contig_impl(
+            exec_q, iter_nelems, sort_nelems, res_tp, res_tp, index_comp,
+            sorted_block_size, // modified in place with size of sorted block
+                               // size
+            {populate_indexed_data_ev});
+
+    // Merge segments in parallel until all elements are sorted
+    sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl(
+        exec_q, iter_nelems, sort_nelems, res_tp, index_comp, sorted_block_size,
+        {base_sort_ev});
+
+    // no need to map back if iter_nelems == 1
+    if (iter_nelems == 1u) {
+        return merges_ev;
+    }
+
+    using MapBackKernelName = index_map_to_rows_krn<argTy, IndexTy>;
+    using dpctl::tensor::kernels::sort_utils_detail::map_back_impl;
+
+    sycl::event write_out_ev = map_back_impl<MapBackKernelName, IndexTy>(
+        exec_q, total_nelems, res_tp, res_tp, sort_nelems, {merges_ev});
+
+    return write_out_ev;
+}
+
+} // namespace dpctl::tensor::kernels
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
new file mode 100644
index 000000000000..940c6d802a9a
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
@@ -0,0 +1,1921 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/sorting/sort_utils.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+
+namespace dpctl::tensor::kernels
+{
+
+namespace radix_sort_details
+{
+
+template <std::uint32_t, bool, typename... TrailingNames>
+class radix_sort_count_kernel;
+
+template <std::uint32_t, typename... TrailingNames>
+class radix_sort_scan_kernel;
+
+template <std::uint32_t, bool, typename... TrailingNames>
+class radix_sort_reorder_peer_kernel;
+
+template <std::uint32_t, bool, typename... TrailingNames>
+class radix_sort_reorder_kernel;
+
+/*! @brief Computes smallest exponent such that `n <= (1 << exponent)` */
+template <typename SizeT,
+          std::enable_if_t<std::is_unsigned_v<SizeT> &&
+                               sizeof(SizeT) == sizeof(std::uint64_t),
+                           int> = 0>
+std::uint32_t ceil_log2(SizeT n)
+{
+    // if n > 2^b, n = q * 2^b + r for q > 0 and 0 <= r < 2^b
+    // floor_log2(q * 2^b + r) == floor_log2(q * 2^b) == q + floor_log2(n1)
+    // ceil_log2(n) == 1 + floor_log2(n-1)
+    if (n <= 1)
+        return std::uint32_t{1};
+
+    std::uint32_t exp{1};
+    --n;
+    if (n >= (SizeT{1} << 32)) {
+        n >>= 32;
+        exp += 32;
+    }
+    if (n >= (SizeT{1} << 16)) {
+        n >>= 16;
+        exp += 16;
+    }
+    if (n >= (SizeT{1} << 8)) {
+        n >>= 8;
+        exp += 8;
+    }
+    if (n >= (SizeT{1} << 4)) {
+        n >>= 4;
+        exp += 4;
+    }
+    if (n >= (SizeT{1} << 2)) {
+        n >>= 2;
+        exp += 2;
+    }
+    if (n >= (SizeT{1} << 1)) {
+        n >>= 1;
+        ++exp;
+    }
+    return exp;
+}
+
+//----------------------------------------------------------
+// bitwise order-preserving conversions to unsigned integers
+//----------------------------------------------------------
+
+template <bool is_ascending>
+bool order_preserving_cast(bool val)
+{
+    if constexpr (is_ascending)
+        return val;
+    else
+        return !val;
+}
+
+template <bool is_ascending,
+          typename UIntT,
+          std::enable_if_t<std::is_unsigned_v<UIntT>, int> = 0>
+UIntT order_preserving_cast(UIntT val)
+{
+    if constexpr (is_ascending) {
+        return val;
+    }
+    else {
+        // bitwise invert
+        return (~val);
+    }
+}
+
+template <bool is_ascending,
+          typename IntT,
+          std::enable_if_t<std::is_integral_v<IntT> && std::is_signed_v<IntT>,
+                           int> = 0>
+std::make_unsigned_t<IntT> order_preserving_cast(IntT val)
+{
+    using UIntT = std::make_unsigned_t<IntT>;
+    const UIntT uint_val = sycl::bit_cast<UIntT>(val);
+
+    if constexpr (is_ascending) {
+        // ascending_mask: 100..0
+        static constexpr UIntT ascending_mask =
+            (UIntT(1) << std::numeric_limits<IntT>::digits);
+        return (uint_val ^ ascending_mask);
+    }
+    else {
+        // descending_mask: 011..1
+        static constexpr UIntT descending_mask =
+            (std::numeric_limits<UIntT>::max() >> 1);
+        return (uint_val ^ descending_mask);
+    }
+}
+
+template <bool is_ascending>
+std::uint16_t order_preserving_cast(sycl::half val)
+{
+    using UIntT = std::uint16_t;
+
+    const UIntT uint_val = sycl::bit_cast<UIntT>(
+        (sycl::isnan(val)) ? std::numeric_limits<sycl::half>::quiet_NaN()
+                           : val);
+    UIntT mask;
+
+    // test the sign bit of the original value
+    const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 15));
+
+    static constexpr UIntT zero_mask = UIntT(0x8000u);
+    static constexpr UIntT nonzero_mask = UIntT(0xFFFFu);
+
+    static constexpr UIntT inv_zero_mask = static_cast<UIntT>(~zero_mask);
+    static constexpr UIntT inv_nonzero_mask = static_cast<UIntT>(~nonzero_mask);
+
+    if constexpr (is_ascending) {
+        mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask;
+    }
+    else {
+        mask = (zero_fp_sign_bit) ? (inv_zero_mask) : (inv_nonzero_mask);
+    }
+
+    return (uint_val ^ mask);
+}
+
+template <bool is_ascending,
+          typename FloatT,
+          std::enable_if_t<std::is_floating_point_v<FloatT> &&
+                               sizeof(FloatT) == sizeof(std::uint32_t),
+                           int> = 0>
+std::uint32_t order_preserving_cast(FloatT val)
+{
+    using UIntT = std::uint32_t;
+
+    UIntT uint_val = sycl::bit_cast<UIntT>(
+        (sycl::isnan(val)) ? std::numeric_limits<FloatT>::quiet_NaN() : val);
+
+    UIntT mask;
+
+    // test the sign bit of the original value
+    const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 31));
+
+    static constexpr UIntT zero_mask = UIntT(0x80000000u);
+    static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFu);
+
+    if constexpr (is_ascending)
+        mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask;
+    else
+        mask = (zero_fp_sign_bit) ? (~zero_mask) : (~nonzero_mask);
+
+    return (uint_val ^ mask);
+}
+
+template <bool is_ascending,
+          typename FloatT,
+          std::enable_if_t<std::is_floating_point_v<FloatT> &&
+                               sizeof(FloatT) == sizeof(std::uint64_t),
+                           int> = 0>
+std::uint64_t order_preserving_cast(FloatT val)
+{
+    using UIntT = std::uint64_t;
+
+    UIntT uint_val = sycl::bit_cast<UIntT>(
+        (sycl::isnan(val)) ? std::numeric_limits<FloatT>::quiet_NaN() : val);
+    UIntT mask;
+
+    // test the sign bit of the original value
+    const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 63));
+
+    static constexpr UIntT zero_mask = UIntT(0x8000000000000000u);
+    static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFFFFFFFFFu);
+
+    if constexpr (is_ascending)
+        mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask;
+    else
+        mask = (zero_fp_sign_bit) ? (~zero_mask) : (~nonzero_mask);
+
+    return (uint_val ^ mask);
+}
+
+//-----------------
+// bucket functions
+//-----------------
+
+template <typename T>
+constexpr std::size_t number_of_bits_in_type()
+{
+    constexpr std::size_t type_bits =
+        (sizeof(T) * std::numeric_limits<unsigned char>::digits);
+    return type_bits;
+}
+
+// the number of buckets (size of radix bits) in T
+template <typename T>
+constexpr std::uint32_t number_of_buckets_in_type(std::uint32_t radix_bits)
+{
+    constexpr std::size_t type_bits = number_of_bits_in_type<T>();
+    return (type_bits + radix_bits - 1) / radix_bits;
+}
+
+// get bits value (bucket) in a certain radix position
+template <std::uint32_t radix_mask, typename T>
+std::uint32_t get_bucket_id(T val, std::uint32_t radix_offset)
+{
+    static_assert(std::is_unsigned_v<T>);
+
+    return (val >> radix_offset) & T(radix_mask);
+}
+
+//--------------------------------
+// count kernel (single iteration)
+//--------------------------------
+
+template <typename KernelName,
+          std::uint32_t radix_bits,
+          typename ValueT,
+          typename CountT,
+          typename Proj>
+sycl::event
+    radix_sort_count_submit(sycl::queue &exec_q,
+                            std::size_t n_iters,
+                            std::size_t n_segments,
+                            std::size_t wg_size,
+                            std::uint32_t radix_offset,
+                            std::size_t n_values,
+                            ValueT *vals_ptr,
+                            std::size_t n_counts,
+                            CountT *counts_ptr,
+                            const Proj &proj_op,
+                            const bool is_ascending,
+                            const std::vector<sycl::event> &dependency_events)
+{
+    // bin_count = radix_states used for an array storing bucket state counters
+    static constexpr std::uint32_t radix_states =
+        (std::uint32_t(1) << radix_bits);
+    static constexpr std::uint32_t radix_mask = radix_states - 1;
+
+    // iteration space info
+    const std::size_t n = n_values;
+    // each segment is processed by a work-group
+    const std::size_t elems_per_segment = (n + n_segments - 1) / n_segments;
+    const std::size_t no_op_flag_id = n_counts - 1;
+
+    assert(n_counts == (n_segments + 1) * radix_states + 1);
+
+    sycl::event local_count_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependency_events);
+
+        sycl::local_accessor<CountT, 1> counts_lacc(wg_size * radix_states,
+                                                    cgh);
+
+        sycl::nd_range<1> ndRange(n_iters * n_segments * wg_size, wg_size);
+
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> ndit) {
+            // 0 <= lid < wg_size
+            const std::size_t lid = ndit.get_local_id(0);
+            // 0 <= group_id < n_segments * n_iters
+            const std::size_t group_id = ndit.get_group(0);
+            const std::size_t iter_id = group_id / n_segments;
+            const std::size_t val_iter_offset = iter_id * n;
+            // 0 <= wgr_id < n_segments
+            const std::size_t wgr_id = group_id - iter_id * n_segments;
+
+            const std::size_t seg_start = elems_per_segment * wgr_id;
+
+            // count per work-item: create a private array for storing count
+            // values here bin_count = radix_states
+            std::array<CountT, radix_states> counts_arr = {CountT{0}};
+
+            // count per work-item: count values and write result to private
+            // count array
+            const std::size_t seg_end =
+                sycl::min(seg_start + elems_per_segment, n);
+            if (is_ascending) {
+                for (std::size_t val_id = seg_start + lid; val_id < seg_end;
+                     val_id += wg_size) {
+                    // get the bucket for the bit-ordered input value,
+                    // applying the offset and mask for radix bits
+                    const auto val =
+                        order_preserving_cast</*is_ascending*/ true>(
+                            proj_op(vals_ptr[val_iter_offset + val_id]));
+                    const std::uint32_t bucket_id =
+                        get_bucket_id<radix_mask>(val, radix_offset);
+
+                    // increment counter for this bit bucket
+                    ++counts_arr[bucket_id];
+                }
+            }
+            else {
+                for (std::size_t val_id = seg_start + lid; val_id < seg_end;
+                     val_id += wg_size) {
+                    // get the bucket for the bit-ordered input value,
+                    // applying the offset and mask for radix bits
+                    const auto val =
+                        order_preserving_cast</*is_ascending*/ false>(
+                            proj_op(vals_ptr[val_iter_offset + val_id]));
+                    const std::uint32_t bucket_id =
+                        get_bucket_id<radix_mask>(val, radix_offset);
+
+                    // increment counter for this bit bucket
+                    ++counts_arr[bucket_id];
+                }
+            }
+
+            // count per work-item: write private count array to local count
+            // array counts_lacc is concatenation of private count arrays from
+            // each work-item in the order of their local ids
+            const std::uint32_t count_start_id = radix_states * lid;
+            for (std::uint32_t radix_state_id = 0;
+                 radix_state_id < radix_states; ++radix_state_id)
+            {
+                counts_lacc[count_start_id + radix_state_id] =
+                    counts_arr[radix_state_id];
+            }
+
+            sycl::group_barrier(ndit.get_group());
+
+            // count per work-group: reduce till count_lacc[] size > wg_size
+            // all work-items in the work-group do the work.
+            for (std::uint32_t i = 1; i < radix_states; ++i) {
+                // Since we interested in computing total count over work-group
+                // for each radix state, the correct result is only assured if
+                // wg_size >= radix_states
+                counts_lacc[lid] += counts_lacc[wg_size * i + lid];
+            }
+
+            sycl::group_barrier(ndit.get_group());
+
+            // count per work-group: reduce until count_lacc[] size >
+            // radix_states (n_witems /= 2 per iteration)
+            for (std::uint32_t n_witems = (wg_size >> 1);
+                 n_witems >= radix_states; n_witems >>= 1)
+            {
+                if (lid < n_witems)
+                    counts_lacc[lid] += counts_lacc[n_witems + lid];
+
+                sycl::group_barrier(ndit.get_group());
+            }
+
+            const std::size_t iter_counter_offset = iter_id * n_counts;
+
+            // count per work-group: write local count array to global count
+            // array
+            if (lid < radix_states) {
+                // move buckets with the same id to adjacent positions,
+                // thus splitting count array into radix_states regions
+                counts_ptr[iter_counter_offset + (n_segments + 1) * lid +
+                           wgr_id] = counts_lacc[lid];
+            }
+
+            // side work: reset 'no-operation-flag', signaling to skip re-order
+            // phase
+            if (wgr_id == 0 && lid == 0) {
+                CountT &no_op_flag =
+                    counts_ptr[iter_counter_offset + no_op_flag_id];
+                no_op_flag = 0;
+            }
+        });
+    });
+
+    return local_count_ev;
+}
+
+//-----------------------------------------------------------------------
+// radix sort: scan kernel (single iteration)
+//-----------------------------------------------------------------------
+
+template <typename KernelName, std::uint32_t radix_bits, typename CountT>
+sycl::event radix_sort_scan_submit(sycl::queue &exec_q,
+                                   std::size_t n_iters,
+                                   std::size_t n_segments,
+                                   std::size_t wg_size,
+                                   std::size_t n_values,
+                                   std::size_t n_counts,
+                                   CountT *counts_ptr,
+                                   const std::vector<sycl::event> depends)
+{
+    const std::size_t no_op_flag_id = n_counts - 1;
+
+    // Scan produces local offsets using count values.
+    // There are no local offsets for the first segment, but the rest segments
+    // should be scanned with respect to the count value in the first segment
+    // what requires n + 1 positions
+    const std::size_t scan_size = n_segments + 1;
+    wg_size = std::min(scan_size, wg_size);
+
+    static constexpr std::uint32_t radix_states = std::uint32_t(1)
+                                                  << radix_bits;
+
+    // compilation of the kernel prevents out of resources issue, which may
+    // occur due to usage of collective algorithms such as joint_exclusive_scan
+    // even if local memory is not explicitly requested
+    sycl::event scan_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        sycl::nd_range<1> ndRange(n_iters * radix_states * wg_size, wg_size);
+
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> ndit) {
+            const std::size_t group_id = ndit.get_group(0);
+            const std::size_t iter_id = group_id / radix_states;
+            const std::size_t wgr_id = group_id - iter_id * radix_states;
+            // find borders of a region with a specific bucket id
+            auto begin_ptr =
+                counts_ptr + scan_size * wgr_id + iter_id * n_counts;
+
+            sycl::joint_exclusive_scan(ndit.get_group(), begin_ptr,
+                                       begin_ptr + scan_size, begin_ptr,
+                                       CountT(0), sycl::plus<CountT>{});
+
+            const auto lid = ndit.get_local_linear_id();
+
+            // NB: No race condition here, because the condition may ever be
+            // true for only on one WG, one WI.
+            if ((lid == wg_size - 1) && (begin_ptr[scan_size - 1] == n_values))
+            {
+                // set flag, since all the values got into one
+                // this is optimization, may happy often for
+                // higher radix offsets (all zeros)
+                auto &no_op_flag =
+                    counts_ptr[iter_id * n_counts + no_op_flag_id];
+                no_op_flag = 1;
+            }
+        });
+    });
+
+    return scan_ev;
+}
+
+//-----------------------------------------------------------------------
+// radix sort: group level reorder algorithms
+//-----------------------------------------------------------------------
+
+struct empty_storage
+{
+    template <typename... T>
+    empty_storage(T &&...)
+    {
+    }
+};
+
+// Number with `n` least significant bits of uint32_t
+inline std::uint32_t n_ls_bits_set(std::uint32_t n) noexcept
+{
+    static constexpr std::uint32_t zero{};
+    static constexpr std::uint32_t all_bits_set = ~zero;
+
+    return ~(all_bits_set << n);
+}
+
+enum class peer_prefix_algo
+{
+    subgroup_ballot,
+    atomic_fetch_or,
+    scan_then_broadcast
+};
+
+template <typename OffsetT, peer_prefix_algo Algo>
+struct peer_prefix_helper;
+
+template <typename AccT>
+auto get_accessor_pointer(const AccT &acc)
+{
+    return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
+}
+
+template <typename OffsetT>
+struct peer_prefix_helper<OffsetT, peer_prefix_algo::atomic_fetch_or>
+{
+    using AtomicT = sycl::atomic_ref<std::uint32_t,
+                                     sycl::memory_order::relaxed,
+                                     sycl::memory_scope::work_group,
+                                     sycl::access::address_space::local_space>;
+    using TempStorageT = sycl::local_accessor<std::uint32_t, 1>;
+
+private:
+    sycl::sub_group sgroup;
+    std::uint32_t lid;
+    std::uint32_t item_mask;
+    AtomicT atomic_peer_mask;
+
+public:
+    peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT lacc)
+        : sgroup(ndit.get_sub_group()), lid(ndit.get_local_linear_id()),
+          item_mask(n_ls_bits_set(lid)), atomic_peer_mask(lacc[0])
+    {
+    }
+
+    std::uint32_t peer_contribution(OffsetT &new_offset_id,
+                                    OffsetT offset_prefix,
+                                    bool wi_bit_set) const
+    {
+        // reset mask for each radix state
+        if (lid == 0)
+            atomic_peer_mask.store(std::uint32_t{0});
+        sycl::group_barrier(sgroup);
+
+        const std::uint32_t uint_contrib{wi_bit_set ? std::uint32_t{1}
+                                                    : std::uint32_t{0}};
+
+        // set local id's bit to 1 if the bucket value matches the radix state
+        atomic_peer_mask.fetch_or(uint_contrib << lid);
+        sycl::group_barrier(sgroup);
+        std::uint32_t peer_mask_bits = atomic_peer_mask.load();
+        std::uint32_t sg_total_offset = sycl::popcount(peer_mask_bits);
+
+        // get the local offset index from the bits set in the peer mask with
+        // index less than the work item ID
+        peer_mask_bits &= item_mask;
+        new_offset_id |= wi_bit_set
+                             ? (offset_prefix + sycl::popcount(peer_mask_bits))
+                             : OffsetT{0};
+        return sg_total_offset;
+    }
+};
+
+template <typename OffsetT>
+struct peer_prefix_helper<OffsetT, peer_prefix_algo::scan_then_broadcast>
+{
+    using TempStorageT = empty_storage;
+    using ItemType = sycl::nd_item<1>;
+    using SubGroupType = sycl::sub_group;
+
+private:
+    SubGroupType sgroup;
+    std::uint32_t sg_size;
+
+public:
+    peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT)
+        : sgroup(ndit.get_sub_group()), sg_size(sgroup.get_local_range()[0])
+    {
+    }
+
+    std::uint32_t peer_contribution(OffsetT &new_offset_id,
+                                    OffsetT offset_prefix,
+                                    bool wi_bit_set) const
+    {
+        const std::uint32_t contrib{wi_bit_set ? std::uint32_t{1}
+                                               : std::uint32_t{0}};
+
+        std::uint32_t sg_item_offset = sycl::exclusive_scan_over_group(
+            sgroup, contrib, sycl::plus<std::uint32_t>{});
+
+        new_offset_id |=
+            (wi_bit_set ? (offset_prefix + sg_item_offset) : OffsetT(0));
+
+        // the last scanned value does not contain number of all copies, thus
+        // adding contribution
+        std::uint32_t sg_total_offset = sycl::group_broadcast(
+            sgroup, sg_item_offset + contrib, sg_size - 1);
+
+        return sg_total_offset;
+    }
+};
+
+template <typename OffsetT>
+struct peer_prefix_helper<OffsetT, peer_prefix_algo::subgroup_ballot>
+{
+private:
+    sycl::sub_group sgroup;
+    std::uint32_t lid;
+    sycl::ext::oneapi::sub_group_mask item_sg_mask;
+
+    sycl::ext::oneapi::sub_group_mask mask_builder(std::uint32_t mask,
+                                                   std::uint32_t sg_size)
+    {
+        return sycl::detail::Builder::createSubGroupMask<
+            sycl::ext::oneapi::sub_group_mask>(mask, sg_size);
+    }
+
+public:
+    using TempStorageT = empty_storage;
+
+    peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT)
+        : sgroup(ndit.get_sub_group()), lid(ndit.get_local_linear_id()),
+          item_sg_mask(
+              mask_builder(n_ls_bits_set(lid), sgroup.get_local_linear_range()))
+    {
+    }
+
+    std::uint32_t peer_contribution(OffsetT &new_offset_id,
+                                    OffsetT offset_prefix,
+                                    bool wi_bit_set) const
+    {
+        // set local id's bit to 1 if the bucket value matches the radix state
+        auto peer_mask = sycl::ext::oneapi::group_ballot(sgroup, wi_bit_set);
+        std::uint32_t peer_mask_bits{};
+
+        peer_mask.extract_bits(peer_mask_bits);
+        std::uint32_t sg_total_offset = sycl::popcount(peer_mask_bits);
+
+        // get the local offset index from the bits set in the peer mask with
+        // index less than the work item ID
+        peer_mask &= item_sg_mask;
+        peer_mask.extract_bits(peer_mask_bits);
+
+        new_offset_id |= wi_bit_set
+                             ? (offset_prefix + sycl::popcount(peer_mask_bits))
+                             : OffsetT(0);
+
+        return sg_total_offset;
+    }
+};
+
+template <typename InputT, typename OutputT>
+void copy_func_for_radix_sort(const std::size_t n_segments,
+                              const std::size_t elems_per_segment,
+                              const std::size_t sg_size,
+                              const std::uint32_t lid,
+                              const std::size_t wgr_id,
+                              const InputT *input_ptr,
+                              const std::size_t n_values,
+                              OutputT *output_ptr)
+{
+    // item info
+    const std::size_t seg_start = elems_per_segment * wgr_id;
+
+    std::size_t seg_end = sycl::min(seg_start + elems_per_segment, n_values);
+
+    // ensure that each work item in a subgroup does the same number of loop
+    // iterations
+    const std::uint16_t tail_size = (seg_end - seg_start) % sg_size;
+    seg_end -= tail_size;
+
+    // find offsets for the same values within a segment and fill the resulting
+    // buffer
+    for (std::size_t val_id = seg_start + lid; val_id < seg_end;
+         val_id += sg_size) {
+        output_ptr[val_id] = std::move(input_ptr[val_id]);
+    }
+
+    if (tail_size > 0 && lid < tail_size) {
+        const std::size_t val_id = seg_end + lid;
+        output_ptr[val_id] = std::move(input_ptr[val_id]);
+    }
+}
+
+//-----------------------------------------------------------------------
+// radix sort: reorder kernel (per iteration)
+//-----------------------------------------------------------------------
+template <typename KernelName,
+          std::uint32_t radix_bits,
+          peer_prefix_algo PeerAlgo,
+          typename InputT,
+          typename OutputT,
+          typename OffsetT,
+          typename ProjT>
+sycl::event
+    radix_sort_reorder_submit(sycl::queue &exec_q,
+                              std::size_t n_iters,
+                              std::size_t n_segments,
+                              std::uint32_t radix_offset,
+                              std::size_t n_values,
+                              const InputT *input_ptr,
+                              OutputT *output_ptr,
+                              std::size_t n_offsets,
+                              OffsetT *offset_ptr,
+                              const ProjT &proj_op,
+                              const bool is_ascending,
+                              const std::vector<sycl::event> dependency_events)
+{
+    using ValueT = InputT;
+    using PeerHelper = peer_prefix_helper<OffsetT, PeerAlgo>;
+
+    static constexpr std::uint32_t radix_states = std::uint32_t{1}
+                                                  << radix_bits;
+    static constexpr std::uint32_t radix_mask = radix_states - 1;
+    const std::size_t elems_per_segment =
+        (n_values + n_segments - 1) / n_segments;
+
+    const std::size_t no_op_flag_id = n_offsets - 1;
+
+    const auto &kernel_id = sycl::get_kernel_id<KernelName>();
+
+    auto const &ctx = exec_q.get_context();
+    auto const &dev = exec_q.get_device();
+    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+        ctx, {dev}, {kernel_id});
+
+    auto krn = kb.get_kernel(kernel_id);
+
+    const std::uint32_t sg_size = krn.template get_info<
+        sycl::info::kernel_device_specific::max_sub_group_size>(dev);
+
+    sycl::event reorder_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependency_events);
+        cgh.use_kernel_bundle(kb);
+
+        using StorageT = typename PeerHelper::TempStorageT;
+
+        StorageT peer_temp(1, cgh);
+
+        sycl::range<1> lRange{sg_size};
+        sycl::range<1> gRange{n_iters * n_segments * sg_size};
+
+        sycl::nd_range<1> ndRange{gRange, lRange};
+
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> ndit) {
+            const std::size_t group_id = ndit.get_group(0);
+            const std::size_t iter_id = group_id / n_segments;
+            const std::size_t segment_id = group_id - iter_id * n_segments;
+
+            auto b_offset_ptr = offset_ptr + iter_id * n_offsets;
+            auto b_input_ptr = input_ptr + iter_id * n_values;
+            auto b_output_ptr = output_ptr + iter_id * n_values;
+
+            const std::uint32_t lid = ndit.get_local_id(0);
+
+            auto &no_op_flag = b_offset_ptr[no_op_flag_id];
+            if (no_op_flag) {
+                // no reordering necessary, simply copy
+                copy_func_for_radix_sort<InputT, OutputT>(
+                    n_segments, elems_per_segment, sg_size, lid, segment_id,
+                    b_input_ptr, n_values, b_output_ptr);
+                return;
+            }
+
+            // create a private array for storing offset values
+            // and add total offset and offset for compute unit
+            // for a certain radix state
+            std::array<OffsetT, radix_states> offset_arr{};
+            const std::size_t scan_size = n_segments + 1;
+
+            OffsetT scanned_bin = 0;
+
+            /* find cumulative offset */
+            static constexpr std::uint32_t zero_radix_state_id = 0;
+            offset_arr[zero_radix_state_id] = b_offset_ptr[segment_id];
+
+            for (std::uint32_t radix_state_id = 1;
+                 radix_state_id < radix_states; ++radix_state_id)
+            {
+                const std::uint32_t local_offset_id =
+                    segment_id + scan_size * radix_state_id;
+
+                // scan bins serially
+                const std::size_t last_segment_bucket_id =
+                    radix_state_id * scan_size - 1;
+                scanned_bin += b_offset_ptr[last_segment_bucket_id];
+
+                offset_arr[radix_state_id] =
+                    scanned_bin + b_offset_ptr[local_offset_id];
+            }
+
+            const std::size_t seg_start = elems_per_segment * segment_id;
+            std::size_t seg_end =
+                sycl::min(seg_start + elems_per_segment, n_values);
+            // ensure that each work item in a subgroup does the same number of
+            // loop iterations
+            const std::uint32_t tail_size = (seg_end - seg_start) % sg_size;
+            seg_end -= tail_size;
+
+            const PeerHelper peer_prefix_hlp(ndit, peer_temp);
+
+            // find offsets for the same values within a segment and fill the
+            // resulting buffer
+            if (is_ascending) {
+                for (std::size_t val_id = seg_start + lid; val_id < seg_end;
+                     val_id += sg_size) {
+                    ValueT in_val = std::move(b_input_ptr[val_id]);
+
+                    // get the bucket for the bit-ordered input value, applying
+                    // the offset and mask for radix bits
+                    const auto mapped_val =
+                        order_preserving_cast</*is_ascending*/ true>(
+                            proj_op(in_val));
+                    std::uint32_t bucket_id =
+                        get_bucket_id<radix_mask>(mapped_val, radix_offset);
+
+                    OffsetT new_offset_id = 0;
+                    for (std::uint32_t radix_state_id = 0;
+                         radix_state_id < radix_states; ++radix_state_id)
+                    {
+                        bool is_current_bucket = (bucket_id == radix_state_id);
+                        std::uint32_t sg_total_offset =
+                            peer_prefix_hlp.peer_contribution(
+                                /* modified by reference */ new_offset_id,
+                                offset_arr[radix_state_id],
+                                /* bit contribution from this work-item */
+                                is_current_bucket);
+                        offset_arr[radix_state_id] += sg_total_offset;
+                    }
+                    b_output_ptr[new_offset_id] = std::move(in_val);
+                }
+            }
+            else {
+                for (std::size_t val_id = seg_start + lid; val_id < seg_end;
+                     val_id += sg_size) {
+                    ValueT in_val = std::move(b_input_ptr[val_id]);
+
+                    // get the bucket for the bit-ordered input value, applying
+                    // the offset and mask for radix bits
+                    const auto mapped_val =
+                        order_preserving_cast</*is_ascending*/ false>(
+                            proj_op(in_val));
+                    std::uint32_t bucket_id =
+                        get_bucket_id<radix_mask>(mapped_val, radix_offset);
+
+                    OffsetT new_offset_id = 0;
+                    for (std::uint32_t radix_state_id = 0;
+                         radix_state_id < radix_states; ++radix_state_id)
+                    {
+                        bool is_current_bucket = (bucket_id == radix_state_id);
+                        std::uint32_t sg_total_offset =
+                            peer_prefix_hlp.peer_contribution(
+                                /* modified by reference */ new_offset_id,
+                                offset_arr[radix_state_id],
+                                /* bit contribution from this work-item */
+                                is_current_bucket);
+                        offset_arr[radix_state_id] += sg_total_offset;
+                    }
+                    b_output_ptr[new_offset_id] = std::move(in_val);
+                }
+            }
+            if (tail_size > 0) {
+                ValueT in_val;
+
+                // default: is greater than any actual radix state
+                std::uint32_t bucket_id = radix_states;
+                if (lid < tail_size) {
+                    in_val = std::move(b_input_ptr[seg_end + lid]);
+
+                    const auto proj_val = proj_op(in_val);
+                    const auto mapped_val =
+                        (is_ascending)
+                            ? order_preserving_cast</*is_ascending*/ true>(
+                                  proj_val)
+                            : order_preserving_cast</*is_ascending*/ false>(
+                                  proj_val);
+                    bucket_id =
+                        get_bucket_id<radix_mask>(mapped_val, radix_offset);
+                }
+
+                OffsetT new_offset_id = 0;
+                for (std::uint32_t radix_state_id = 0;
+                     radix_state_id < radix_states; ++radix_state_id)
+                {
+                    bool is_current_bucket = (bucket_id == radix_state_id);
+                    std::uint32_t sg_total_offset =
+                        peer_prefix_hlp.peer_contribution(
+                            new_offset_id, offset_arr[radix_state_id],
+                            is_current_bucket);
+
+                    offset_arr[radix_state_id] += sg_total_offset;
+                }
+
+                if (lid < tail_size) {
+                    b_output_ptr[new_offset_id] = std::move(in_val);
+                }
+            }
+        });
+    });
+
+    return reorder_ev;
+}
+
+template <typename sizeT>
+sizeT _slm_adjusted_work_group_size(sycl::queue &exec_q,
+                                    sizeT required_slm_bytes_per_wg,
+                                    sizeT wg_size)
+{
+    const auto &dev = exec_q.get_device();
+
+    if (wg_size == 0)
+        wg_size =
+            dev.template get_info<sycl::info::device::max_work_group_size>();
+
+    const auto local_mem_sz =
+        dev.template get_info<sycl::info::device::local_mem_size>();
+
+    return sycl::min(local_mem_sz / required_slm_bytes_per_wg, wg_size);
+}
+
+//-----------------------------------------------------------------------
+// radix sort: one iteration
+//-----------------------------------------------------------------------
+
+template <std::uint32_t radix_bits, bool even>
+struct parallel_radix_sort_iteration_step
+{
+    template <typename... Name>
+    using count_phase = radix_sort_count_kernel<radix_bits, even, Name...>;
+    template <typename... Name>
+    using local_scan_phase = radix_sort_scan_kernel<radix_bits, Name...>;
+    template <typename... Name>
+    using reorder_peer_phase =
+        radix_sort_reorder_peer_kernel<radix_bits, even, Name...>;
+    template <typename... Name>
+    using reorder_phase = radix_sort_reorder_kernel<radix_bits, even, Name...>;
+
+    template <typename InputT,
+              typename OutputT,
+              typename CountT,
+              typename ProjT>
+    static sycl::event submit(sycl::queue &exec_q,
+                              std::size_t n_iters,
+                              std::size_t n_segments,
+                              std::uint32_t radix_iter,
+                              std::size_t n_values,
+                              const InputT *in_ptr,
+                              OutputT *out_ptr,
+                              std::size_t n_counts,
+                              CountT *counts_ptr,
+                              const ProjT &proj_op,
+                              const bool is_ascending,
+                              const std::vector<sycl::event> &dependency_events)
+    {
+        using _RadixCountKernel = count_phase<InputT, OutputT, CountT, ProjT>;
+        using _RadixLocalScanKernel =
+            local_scan_phase<InputT, OutputT, CountT, ProjT>;
+        using _RadixReorderPeerKernel =
+            reorder_peer_phase<InputT, OutputT, CountT, ProjT>;
+        using _RadixReorderKernel =
+            reorder_phase<InputT, OutputT, CountT, ProjT>;
+
+        const auto &supported_sub_group_sizes =
+            exec_q.get_device()
+                .template get_info<sycl::info::device::sub_group_sizes>();
+        const std::size_t max_sg_size =
+            (supported_sub_group_sizes.empty()
+                 ? 0
+                 : supported_sub_group_sizes.back());
+        const std::size_t reorder_sg_size = max_sg_size;
+        const std::size_t scan_wg_size =
+            exec_q.get_device()
+                .template get_info<sycl::info::device::max_work_group_size>();
+
+        static constexpr std::size_t two_mils = (std::size_t(1) << 21);
+        std::size_t count_wg_size =
+            ((max_sg_size > 0) && (n_values > two_mils) ? 128 : max_sg_size);
+
+        static constexpr std::uint32_t radix_states = std::uint32_t(1)
+                                                      << radix_bits;
+
+        // correct count_wg_size according to local memory limit in count phase
+        const auto max_count_wg_size = _slm_adjusted_work_group_size(
+            exec_q, sizeof(CountT) * radix_states, count_wg_size);
+        count_wg_size =
+            static_cast<::std::size_t>((max_count_wg_size / radix_states)) *
+            radix_states;
+
+        // work-group size must be a power of 2 and not less than the number of
+        // states, for scanning to work correctly
+
+        const std::size_t rounded_down_count_wg_size =
+            std::size_t{1} << (number_of_bits_in_type<std::size_t>() -
+                               sycl::clz(count_wg_size) - 1);
+        count_wg_size =
+            sycl::max(rounded_down_count_wg_size, std::size_t(radix_states));
+
+        // Compute the radix position for the given iteration
+        std::uint32_t radix_offset = radix_iter * radix_bits;
+
+        // 1. Count Phase
+        sycl::event count_ev =
+            radix_sort_count_submit<_RadixCountKernel, radix_bits>(
+                exec_q, n_iters, n_segments, count_wg_size, radix_offset,
+                n_values, in_ptr, n_counts, counts_ptr, proj_op, is_ascending,
+                dependency_events);
+
+        // 2. Scan Phase
+        sycl::event scan_ev =
+            radix_sort_scan_submit<_RadixLocalScanKernel, radix_bits>(
+                exec_q, n_iters, n_segments, scan_wg_size, n_values, n_counts,
+                counts_ptr, {count_ev});
+
+        // 3. Reorder Phase
+        sycl::event reorder_ev{};
+        // subgroup_ballot-based peer algo uses extract_bits to populate
+        // uint32_t mask and hence relies on sub-group to be 32 or narrower
+        static constexpr std::size_t sg32_v = 32u;
+        static constexpr std::size_t sg16_v = 16u;
+        static constexpr std::size_t sg08_v = 8u;
+        if (sg32_v == reorder_sg_size || sg16_v == reorder_sg_size ||
+            sg08_v == reorder_sg_size)
+        {
+            static constexpr auto peer_algorithm =
+                peer_prefix_algo::subgroup_ballot;
+
+            reorder_ev = radix_sort_reorder_submit<_RadixReorderPeerKernel,
+                                                   radix_bits, peer_algorithm>(
+                exec_q, n_iters, n_segments, radix_offset, n_values, in_ptr,
+                out_ptr, n_counts, counts_ptr, proj_op, is_ascending,
+                {scan_ev});
+        }
+        else {
+            static constexpr auto peer_algorithm =
+                peer_prefix_algo::scan_then_broadcast;
+
+            reorder_ev = radix_sort_reorder_submit<_RadixReorderKernel,
+                                                   radix_bits, peer_algorithm>(
+                exec_q, n_iters, n_segments, radix_offset, n_values, in_ptr,
+                out_ptr, n_counts, counts_ptr, proj_op, is_ascending,
+                {scan_ev});
+        }
+
+        return reorder_ev;
+    }
+}; // struct parallel_radix_sort_iteration
+
+template <typename Names, std::uint16_t... Constants>
+class radix_sort_one_wg_krn;
+
+template <typename KernelNameBase,
+          std::uint16_t wg_size = 256,
+          std::uint16_t block_size = 16,
+          std::uint32_t radix = 4,
+          std::uint16_t req_sub_group_size = (block_size < 4 ? 32 : 16)>
+struct subgroup_radix_sort
+{
+private:
+    class use_slm_tag
+    {
+    };
+    class use_global_mem_tag
+    {
+    };
+
+public:
+    template <typename ValueT, typename OutputT, typename ProjT>
+    sycl::event operator()(sycl::queue &exec_q,
+                           std::size_t n_iters,
+                           std::size_t n_to_sort,
+                           ValueT *input_ptr,
+                           OutputT *output_ptr,
+                           ProjT proj_op,
+                           const bool is_ascending,
+                           const std::vector<sycl::event> &depends)
+    {
+        static_assert(std::is_same_v<std::remove_cv_t<ValueT>, OutputT>);
+
+        using _SortKernelLoc =
+            radix_sort_one_wg_krn<KernelNameBase, wg_size, block_size, 0>;
+        using _SortKernelPartGlob =
+            radix_sort_one_wg_krn<KernelNameBase, wg_size, block_size, 1>;
+        using _SortKernelGlob =
+            radix_sort_one_wg_krn<KernelNameBase, wg_size, block_size, 2>;
+
+        static constexpr std::size_t max_concurrent_work_groups = 128U;
+
+        // Choose this to occupy the entire accelerator
+        const std::size_t n_work_groups =
+            std::min<std::size_t>(n_iters, max_concurrent_work_groups);
+
+        // determine which temporary allocation can be accommodated in SLM
+        const auto &SLM_availability =
+            check_slm_size<ValueT>(exec_q, n_to_sort);
+
+        const std::size_t n_batch_size = n_work_groups;
+
+        switch (SLM_availability) {
+        case temp_allocations::both_in_slm:
+        {
+            static constexpr auto storage_for_values = use_slm_tag{};
+            static constexpr auto storage_for_counters = use_slm_tag{};
+
+            return one_group_submitter<_SortKernelLoc>()(
+                exec_q, n_iters, n_iters, n_to_sort, input_ptr, output_ptr,
+                proj_op, is_ascending, storage_for_values, storage_for_counters,
+                depends);
+        }
+        case temp_allocations::counters_in_slm:
+        {
+            static constexpr auto storage_for_values = use_global_mem_tag{};
+            static constexpr auto storage_for_counters = use_slm_tag{};
+
+            return one_group_submitter<_SortKernelPartGlob>()(
+                exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr,
+                proj_op, is_ascending, storage_for_values, storage_for_counters,
+                depends);
+        }
+        default:
+        {
+            static constexpr auto storage_for_values = use_global_mem_tag{};
+            static constexpr auto storage_for_counters = use_global_mem_tag{};
+
+            return one_group_submitter<_SortKernelGlob>()(
+                exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr,
+                proj_op, is_ascending, storage_for_values, storage_for_counters,
+                depends);
+        }
+        }
+    }
+
+private:
+    template <typename KeyT, typename>
+    class TempBuf;
+
+    template <typename KeyT>
+    class TempBuf<KeyT, use_slm_tag>
+    {
+        std::size_t buf_size;
+
+    public:
+        TempBuf(std::size_t, std::size_t n) : buf_size(n) {}
+        auto get_acc(sycl::handler &cgh)
+        {
+            return sycl::local_accessor<KeyT>(buf_size, cgh);
+        }
+
+        std::size_t get_iter_stride() const
+        {
+            return std::size_t{0};
+        }
+    };
+
+    template <typename KeyT>
+    class TempBuf<KeyT, use_global_mem_tag>
+    {
+        sycl::buffer<KeyT> buf;
+        std::size_t iter_stride;
+
+    public:
+        TempBuf(std::size_t n_iters, std::size_t n)
+            : buf(n_iters * n), iter_stride(n)
+        {
+        }
+        auto get_acc(sycl::handler &cgh)
+        {
+            return sycl::accessor(buf, cgh, sycl::read_write, sycl::no_init);
+        }
+        std::size_t get_iter_stride() const
+        {
+            return iter_stride;
+        }
+    };
+
+    static_assert(wg_size <= 1024);
+    static constexpr std::uint16_t bin_count = (1 << radix);
+    static constexpr std::uint16_t counter_buf_sz = wg_size * bin_count + 1;
+
+    enum class temp_allocations
+    {
+        both_in_slm,
+        counters_in_slm,
+        both_in_global_mem
+    };
+
+    template <typename T, typename SizeT>
+    temp_allocations check_slm_size(const sycl::queue &exec_q, SizeT n)
+    {
+        // the kernel is designed for data size <= 64K
+        assert(n <= (SizeT(1) << 16));
+
+        static constexpr auto req_slm_size_counters =
+            counter_buf_sz * sizeof(std::uint16_t);
+
+        const auto &dev = exec_q.get_device();
+
+        // Pessimistically only use half of the memory to take into account
+        // a SYCL group algorithm might use a portion of SLM
+        const std::size_t max_slm_size =
+            dev.template get_info<sycl::info::device::local_mem_size>() / 2;
+
+        const auto n_uniform = 1 << ceil_log2(n);
+        const auto req_slm_size_val = sizeof(T) * n_uniform;
+
+        return ((req_slm_size_val + req_slm_size_counters) <= max_slm_size)
+                   ?
+                   // the values and the counters are placed in SLM
+                   temp_allocations::both_in_slm
+                   : (req_slm_size_counters <= max_slm_size)
+                         ?
+                         // the counters are placed in SLM, the values - in the
+                         // global memory
+                         temp_allocations::counters_in_slm
+                         :
+                         // the values and the counters are placed in the global
+                         // memory
+                         temp_allocations::both_in_global_mem;
+    }
+
+    template <typename KernelName>
+    struct one_group_submitter
+    {
+        template <typename InputT,
+                  typename OutputT,
+                  typename ProjT,
+                  typename SLM_value_tag,
+                  typename SLM_counter_tag>
+        sycl::event operator()(sycl::queue &exec_q,
+                               std::size_t n_iters,
+                               std::size_t n_batch_size,
+                               std::size_t n_values,
+                               InputT *input_arr,
+                               OutputT *output_arr,
+                               const ProjT &proj_op,
+                               const bool is_ascending,
+                               SLM_value_tag,
+                               SLM_counter_tag,
+                               const std::vector<sycl::event> &depends)
+        {
+            assert(!(n_values >> 16));
+
+            assert(n_values <= static_cast<std::size_t>(block_size) *
+                                   static_cast<std::size_t>(wg_size));
+
+            const std::uint16_t n = static_cast<std::uint16_t>(n_values);
+            static_assert(std::is_same_v<std::remove_cv_t<InputT>, OutputT>);
+
+            using ValueT = OutputT;
+
+            using KeyT = std::invoke_result_t<ProjT, ValueT>;
+
+            TempBuf<ValueT, SLM_value_tag> buf_val(
+                n_batch_size, static_cast<std::size_t>(block_size * wg_size));
+            TempBuf<std::uint16_t, SLM_counter_tag> buf_count(
+                n_batch_size, static_cast<std::size_t>(counter_buf_sz));
+
+            sycl::range<1> lRange{wg_size};
+
+            sycl::event sort_ev;
+            std::vector<sycl::event> deps{depends};
+
+            const std::size_t n_batches =
+                (n_iters + n_batch_size - 1) / n_batch_size;
+
+            const auto &kernel_id = sycl::get_kernel_id<KernelName>();
+
+            auto const &ctx = exec_q.get_context();
+            auto const &dev = exec_q.get_device();
+            auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+                ctx, {dev}, {kernel_id});
+
+            const auto &krn = kb.get_kernel(kernel_id);
+
+            const std::uint32_t krn_sg_size = krn.template get_info<
+                sycl::info::kernel_device_specific::max_sub_group_size>(dev);
+
+            // due to a bug in CPU device implementation, an additional
+            // synchronization is necessary for short sub-group sizes
+            const bool work_around_needed =
+                exec_q.get_device().has(sycl::aspect::cpu) &&
+                (krn_sg_size < 16);
+
+            for (std::size_t batch_id = 0; batch_id < n_batches; ++batch_id) {
+
+                const std::size_t block_start = batch_id * n_batch_size;
+
+                // input_arr/output_arr each has shape (n_iters, n)
+                InputT *this_input_arr = input_arr + block_start * n_values;
+                OutputT *this_output_arr = output_arr + block_start * n_values;
+
+                const std::size_t block_end =
+                    std::min<std::size_t>(block_start + n_batch_size, n_iters);
+
+                sycl::range<1> gRange{(block_end - block_start) * wg_size};
+                sycl::nd_range ndRange{gRange, lRange};
+
+                sort_ev = exec_q.submit([&](sycl::handler &cgh) {
+                    cgh.depends_on(deps);
+                    cgh.use_kernel_bundle(kb);
+
+                    // allocation to use for value exchanges
+                    auto exchange_acc = buf_val.get_acc(cgh);
+                    const std::size_t exchange_acc_iter_stride =
+                        buf_val.get_iter_stride();
+
+                    // allocation for counters
+                    auto counter_acc = buf_count.get_acc(cgh);
+                    const std::size_t counter_acc_iter_stride =
+                        buf_count.get_iter_stride();
+
+                    cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1>
+                                                                  ndit) {
+                        ValueT values[block_size];
+
+                        const std::size_t iter_id = ndit.get_group(0);
+                        const std::size_t iter_val_offset =
+                            iter_id * static_cast<std::size_t>(n);
+                        const std::size_t iter_counter_offset =
+                            iter_id * counter_acc_iter_stride;
+                        const std::size_t iter_exchange_offset =
+                            iter_id * exchange_acc_iter_stride;
+
+                        std::uint16_t wi = ndit.get_local_linear_id();
+                        std::uint16_t begin_bit = 0;
+
+                        static constexpr std::uint16_t end_bit =
+                            number_of_bits_in_type<KeyT>();
+
+                        // copy from input array into values
+#pragma unroll
+                        for (std::uint16_t i = 0; i < block_size; ++i) {
+                            const std::uint16_t id = wi * block_size + i;
+                            values[i] =
+                                (id < n) ? this_input_arr[iter_val_offset + id]
+                                         : ValueT{};
+                        }
+
+                        while (true) {
+                            // indices for indirect access in the "re-order"
+                            // phase
+                            std::uint16_t indices[block_size];
+                            {
+                                // pointers to bucket's counters
+                                std::uint16_t *counters[block_size];
+
+                                // counting phase
+                                auto pcounter =
+                                    get_accessor_pointer(counter_acc) +
+                                    (wi + iter_counter_offset);
+
+                                // initialize counters
+#pragma unroll
+                                for (std::uint16_t i = 0; i < bin_count; ++i)
+                                    pcounter[i * wg_size] = std::uint16_t{0};
+
+                                sycl::group_barrier(ndit.get_group());
+
+                                if (is_ascending) {
+#pragma unroll
+                                    for (std::uint16_t i = 0; i < block_size;
+                                         ++i) {
+                                        const std::uint16_t id =
+                                            wi * block_size + i;
+                                        static constexpr std::uint16_t
+                                            bin_mask = bin_count - 1;
+
+                                        // points to the padded element, i.e. id
+                                        // is in-range
+                                        static constexpr std::uint16_t
+                                            default_out_of_range_bin_id =
+                                                bin_mask;
+
+                                        const std::uint16_t bin =
+                                            (id < n)
+                                                ? get_bucket_id<bin_mask>(
+                                                      order_preserving_cast<
+                                                          /* is_ascending */
+                                                          true>(
+                                                          proj_op(values[i])),
+                                                      begin_bit)
+                                                : default_out_of_range_bin_id;
+
+                                        // counting and local offset calculation
+                                        counters[i] = &pcounter[bin * wg_size];
+                                        indices[i] = *counters[i];
+                                        *counters[i] = indices[i] + 1;
+
+                                        if (work_around_needed) {
+                                            sycl::group_barrier(
+                                                ndit.get_group());
+                                        }
+                                    }
+                                }
+                                else {
+#pragma unroll
+                                    for (std::uint16_t i = 0; i < block_size;
+                                         ++i) {
+                                        const std::uint16_t id =
+                                            wi * block_size + i;
+                                        static constexpr std::uint16_t
+                                            bin_mask = bin_count - 1;
+
+                                        // points to the padded element, i.e. id
+                                        // is in-range
+                                        static constexpr std::uint16_t
+                                            default_out_of_range_bin_id =
+                                                bin_mask;
+
+                                        const std::uint16_t bin =
+                                            (id < n)
+                                                ? get_bucket_id<bin_mask>(
+                                                      order_preserving_cast<
+                                                          /* is_ascending */
+                                                          false>(
+                                                          proj_op(values[i])),
+                                                      begin_bit)
+                                                : default_out_of_range_bin_id;
+
+                                        // counting and local offset calculation
+                                        counters[i] = &pcounter[bin * wg_size];
+                                        indices[i] = *counters[i];
+                                        *counters[i] = indices[i] + 1;
+
+                                        if (work_around_needed) {
+                                            sycl::group_barrier(
+                                                ndit.get_group());
+                                        }
+                                    }
+                                }
+
+                                sycl::group_barrier(ndit.get_group());
+
+                                // exclusive scan phase
+                                {
+
+                                    // scan contiguous numbers
+                                    std::uint16_t bin_sum[bin_count];
+                                    const std::size_t counter_offset0 =
+                                        iter_counter_offset + wi * bin_count;
+                                    bin_sum[0] = counter_acc[counter_offset0];
+
+#pragma unroll
+                                    for (std::uint16_t i = 1; i < bin_count;
+                                         ++i)
+                                        bin_sum[i] =
+                                            bin_sum[i - 1] +
+                                            counter_acc[counter_offset0 + i];
+
+                                    sycl::group_barrier(ndit.get_group());
+
+                                    // exclusive scan local sum
+                                    std::uint16_t sum_scan =
+                                        sycl::exclusive_scan_over_group(
+                                            ndit.get_group(),
+                                            bin_sum[bin_count - 1],
+                                            sycl::plus<std::uint16_t>());
+
+// add to local sum, generate exclusive scan result
+#pragma unroll
+                                    for (std::uint16_t i = 0; i < bin_count;
+                                         ++i)
+                                        counter_acc[counter_offset0 + i + 1] =
+                                            sum_scan + bin_sum[i];
+
+                                    if (wi == 0)
+                                        counter_acc[iter_counter_offset + 0] =
+                                            std::uint32_t{0};
+
+                                    sycl::group_barrier(ndit.get_group());
+                                }
+
+#pragma unroll
+                                for (std::uint16_t i = 0; i < block_size; ++i) {
+                                    // a global index is a local offset plus a
+                                    // global base index
+                                    indices[i] += *counters[i];
+                                }
+
+                                sycl::group_barrier(ndit.get_group());
+                            }
+
+                            begin_bit += radix;
+
+                            // "re-order" phase
+                            sycl::group_barrier(ndit.get_group());
+                            if (begin_bit >= end_bit) {
+                                // the last iteration - writing out the result
+#pragma unroll
+                                for (std::uint16_t i = 0; i < block_size; ++i) {
+                                    const std::uint16_t r = indices[i];
+                                    if (r < n) {
+                                        this_output_arr[iter_val_offset + r] =
+                                            values[i];
+                                    }
+                                }
+
+                                return;
+                            }
+
+                            // data exchange
+#pragma unroll
+                            for (std::uint16_t i = 0; i < block_size; ++i) {
+                                const std::uint16_t r = indices[i];
+                                if (r < n)
+                                    exchange_acc[iter_exchange_offset + r] =
+                                        values[i];
+                            }
+
+                            sycl::group_barrier(ndit.get_group());
+
+#pragma unroll
+                            for (std::uint16_t i = 0; i < block_size; ++i) {
+                                const std::uint16_t id = wi * block_size + i;
+                                if (id < n)
+                                    values[i] =
+                                        exchange_acc[iter_exchange_offset + id];
+                            }
+
+                            sycl::group_barrier(ndit.get_group());
+                        }
+                    });
+                });
+
+                deps = {sort_ev};
+            }
+
+            return sort_ev;
+        }
+    };
+};
+
+template <typename ValueT, typename ProjT>
+struct OneWorkGroupRadixSortKernel;
+
+//-----------------------------------------------------------------------
+// radix sort: main function
+//-----------------------------------------------------------------------
+template <typename ValueT, typename ProjT>
+sycl::event parallel_radix_sort_impl(sycl::queue &exec_q,
+                                     std::size_t n_iters,
+                                     std::size_t n_to_sort,
+                                     const ValueT *input_arr,
+                                     ValueT *output_arr,
+                                     const ProjT &proj_op,
+                                     const bool is_ascending,
+                                     const std::vector<sycl::event> &depends)
+{
+    assert(n_to_sort > 1);
+
+    using KeyT = std::remove_cv_t<
+        std::remove_reference_t<std::invoke_result_t<ProjT, ValueT>>>;
+
+    // radix bits represent number of processed bits in each value during one
+    // iteration
+    static constexpr std::uint32_t radix_bits = 4;
+
+    sycl::event sort_ev{};
+
+    const auto &dev = exec_q.get_device();
+    const auto max_wg_size =
+        dev.template get_info<sycl::info::device::max_work_group_size>();
+
+    static constexpr std::uint16_t ref_wg_size = 64;
+    if (n_to_sort <= 16384 && ref_wg_size * 8 <= max_wg_size) {
+        using _RadixSortKernel = OneWorkGroupRadixSortKernel<ValueT, ProjT>;
+
+        if (n_to_sort <= 64 && ref_wg_size <= max_wg_size) {
+            // wg_size * block_size == 64 * 1 * 1 == 64
+            static constexpr std::uint16_t wg_size = ref_wg_size;
+            static constexpr std::uint16_t block_size = 1;
+
+            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
+                                          radix_bits>{}(
+                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
+                is_ascending, depends);
+        }
+        else if (n_to_sort <= 128 && ref_wg_size * 2 <= max_wg_size) {
+            // wg_size * block_size == 64 * 2 * 1 == 128
+            static constexpr std::uint16_t wg_size = ref_wg_size * 2;
+            static constexpr std::uint16_t block_size = 1;
+
+            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
+                                          radix_bits>{}(
+                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
+                is_ascending, depends);
+        }
+        else if (n_to_sort <= 256 && ref_wg_size * 2 <= max_wg_size) {
+            // wg_size * block_size == 64 * 2 * 2 == 256
+            static constexpr std::uint16_t wg_size = ref_wg_size * 2;
+            static constexpr std::uint16_t block_size = 2;
+
+            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
+                                          radix_bits>{}(
+                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
+                is_ascending, depends);
+        }
+        else if (n_to_sort <= 512 && ref_wg_size * 2 <= max_wg_size) {
+            // wg_size * block_size == 64 * 2 * 4 == 512
+            static constexpr std::uint16_t wg_size = ref_wg_size * 2;
+            static constexpr std::uint16_t block_size = 4;
+
+            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
+                                          radix_bits>{}(
+                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
+                is_ascending, depends);
+        }
+        else if (n_to_sort <= 1024 && ref_wg_size * 2 <= max_wg_size) {
+            // wg_size * block_size == 64 * 2 * 8 == 1024
+            static constexpr std::uint16_t wg_size = ref_wg_size * 2;
+            static constexpr std::uint16_t block_size = 8;
+
+            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
+                                          radix_bits>{}(
+                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
+                is_ascending, depends);
+        }
+        else if (n_to_sort <= 2048 && ref_wg_size * 4 <= max_wg_size) {
+            // wg_size * block_size == 64 * 4 * 8 == 2048
+            static constexpr std::uint16_t wg_size = ref_wg_size * 4;
+            static constexpr std::uint16_t block_size = 8;
+
+            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
+                                          radix_bits>{}(
+                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
+                is_ascending, depends);
+        }
+        else if (n_to_sort <= 4096 && ref_wg_size * 4 <= max_wg_size) {
+            // wg_size * block_size == 64 * 4 * 16 == 4096
+            static constexpr std::uint16_t wg_size = ref_wg_size * 4;
+            static constexpr std::uint16_t block_size = 16;
+
+            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
+                                          radix_bits>{}(
+                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
+                is_ascending, depends);
+        }
+        else if (n_to_sort <= 8192 && ref_wg_size * 8 <= max_wg_size) {
+            // wg_size * block_size == 64 * 8 * 16 == 8192
+            static constexpr std::uint16_t wg_size = ref_wg_size * 8;
+            static constexpr std::uint16_t block_size = 16;
+
+            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
+                                          radix_bits>{}(
+                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
+                is_ascending, depends);
+        }
+        else {
+            // wg_size * block_size == 64 * 8 * 32 == 16384
+            static constexpr std::uint16_t wg_size = ref_wg_size * 8;
+            static constexpr std::uint16_t block_size = 32;
+
+            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
+                                          radix_bits>{}(
+                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
+                is_ascending, depends);
+        }
+    }
+    else {
+        static constexpr std::uint32_t radix_iters =
+            number_of_buckets_in_type<KeyT>(radix_bits);
+        static constexpr std::uint32_t radix_states = std::uint32_t(1)
+                                                      << radix_bits;
+
+        static constexpr std::size_t bound_512k = (std::size_t(1) << 19);
+        static constexpr std::size_t bound_2m = (std::size_t(1) << 21);
+
+        const auto wg_sz_k = (n_to_sort < bound_512k)  ? 8
+                             : (n_to_sort <= bound_2m) ? 4
+                                                       : 1;
+        const std::size_t wg_size = max_wg_size / wg_sz_k;
+
+        const std::size_t n_segments = (n_to_sort + wg_size - 1) / wg_size;
+
+        // Additional radix_states elements are used for getting local offsets
+        // from count values + no_op flag; 'No operation' flag specifies whether
+        // to skip re-order phase if the all keys are the same (lie in one bin)
+        const std::size_t n_counts =
+            (n_segments + 1) * radix_states + 1 /*no_op flag*/;
+
+        using CountT = std::uint32_t;
+
+        // memory for storing count and offset values
+        auto count_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<CountT>(
+                n_iters * n_counts, exec_q);
+
+        CountT *count_ptr = count_owner.get();
+
+        static constexpr std::uint32_t zero_radix_iter{0};
+
+        if constexpr (std::is_same_v<KeyT, bool>) {
+
+            sort_ev = parallel_radix_sort_iteration_step<
+                radix_bits, /*even=*/true>::submit(exec_q, n_iters, n_segments,
+                                                   zero_radix_iter, n_to_sort,
+                                                   input_arr, output_arr,
+                                                   n_counts, count_ptr, proj_op,
+                                                   is_ascending, depends);
+
+            sort_ev = dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {sort_ev}, count_owner);
+
+            return sort_ev;
+        }
+
+        auto tmp_arr_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<ValueT>(
+                n_iters * n_to_sort, exec_q);
+
+        ValueT *tmp_arr = tmp_arr_owner.get();
+
+        // iterations per each bucket
+        assert("Number of iterations must be even" && radix_iters % 2 == 0);
+        assert(radix_iters > 0);
+
+        sort_ev = parallel_radix_sort_iteration_step<
+            radix_bits, /*even=*/true>::submit(exec_q, n_iters, n_segments,
+                                               zero_radix_iter, n_to_sort,
+                                               input_arr, tmp_arr, n_counts,
+                                               count_ptr, proj_op, is_ascending,
+                                               depends);
+
+        for (std::uint32_t radix_iter = 1; radix_iter < radix_iters;
+             ++radix_iter) {
+            if (radix_iter % 2 == 0) {
+                sort_ev = parallel_radix_sort_iteration_step<
+                    radix_bits,
+                    /*even=*/true>::submit(exec_q, n_iters, n_segments,
+                                           radix_iter, n_to_sort, output_arr,
+                                           tmp_arr, n_counts, count_ptr,
+                                           proj_op, is_ascending, {sort_ev});
+            }
+            else {
+                sort_ev = parallel_radix_sort_iteration_step<
+                    radix_bits,
+                    /*even=*/false>::submit(exec_q, n_iters, n_segments,
+                                            radix_iter, n_to_sort, tmp_arr,
+                                            output_arr, n_counts, count_ptr,
+                                            proj_op, is_ascending, {sort_ev});
+            }
+        }
+
+        sort_ev = dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {sort_ev}, tmp_arr_owner, count_owner);
+    }
+
+    return sort_ev;
+}
+
+struct IdentityProj
+{
+    constexpr IdentityProj() {}
+
+    template <typename T>
+    constexpr T operator()(T val) const
+    {
+        return val;
+    }
+};
+
+template <typename ValueT, typename IndexT>
+struct ValueProj
+{
+    constexpr ValueProj() {}
+
+    constexpr ValueT operator()(const std::pair<ValueT, IndexT> &pair) const
+    {
+        return pair.first;
+    }
+};
+
+template <typename IndexT, typename ValueT, typename ProjT>
+struct IndexedProj
+{
+    IndexedProj(const ValueT *arg_ptr) : ptr(arg_ptr), value_projector{} {}
+
+    IndexedProj(const ValueT *arg_ptr, const ProjT &proj_op)
+        : ptr(arg_ptr), value_projector(proj_op)
+    {
+    }
+
+    auto operator()(IndexT i) const
+    {
+        return value_projector(ptr[i]);
+    }
+
+private:
+    const ValueT *ptr;
+    ProjT value_projector;
+};
+
+} // namespace radix_sort_details
+
+using dpctl::tensor::ssize_t;
+
+template <typename argTy>
+sycl::event
+    radix_sort_axis1_contig_impl(sycl::queue &exec_q,
+                                 const bool sort_ascending,
+                                 // number of sub-arrays to sort (num. of rows
+                                 // in a matrix when sorting over rows)
+                                 std::size_t iter_nelems,
+                                 // size of each array to sort  (length of rows,
+                                 // i.e. number of columns)
+                                 std::size_t sort_nelems,
+                                 const char *arg_cp,
+                                 char *res_cp,
+                                 ssize_t iter_arg_offset,
+                                 ssize_t iter_res_offset,
+                                 ssize_t sort_arg_offset,
+                                 ssize_t sort_res_offset,
+                                 const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + sort_arg_offset;
+    argTy *res_tp =
+        reinterpret_cast<argTy *>(res_cp) + iter_res_offset + sort_res_offset;
+
+    using Proj = radix_sort_details::IdentityProj;
+    static constexpr Proj proj_op{};
+
+    sycl::event radix_sort_ev =
+        radix_sort_details::parallel_radix_sort_impl<argTy, Proj>(
+            exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, proj_op,
+            sort_ascending, depends);
+
+    return radix_sort_ev;
+}
+
+template <typename ValueT, typename IndexT>
+class radix_argsort_index_write_out_krn;
+
+template <typename ValueT, typename IndexT>
+class radix_argsort_iota_krn;
+
+template <typename argTy, typename IndexTy>
+sycl::event
+    radix_argsort_axis1_contig_impl(sycl::queue &exec_q,
+                                    const bool sort_ascending,
+                                    // number of sub-arrays to sort (num. of
+                                    // rows in a matrix when sorting over rows)
+                                    std::size_t iter_nelems,
+                                    // size of each array to sort  (length of
+                                    // rows, i.e. number of columns)
+                                    std::size_t sort_nelems,
+                                    const char *arg_cp,
+                                    char *res_cp,
+                                    ssize_t iter_arg_offset,
+                                    ssize_t iter_res_offset,
+                                    ssize_t sort_arg_offset,
+                                    ssize_t sort_res_offset,
+                                    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + sort_arg_offset;
+    IndexTy *res_tp =
+        reinterpret_cast<IndexTy *>(res_cp) + iter_res_offset + sort_res_offset;
+
+    const std::size_t total_nelems = iter_nelems * sort_nelems;
+    auto workspace_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(total_nelems,
+                                                                 exec_q);
+
+    // get raw USM pointer
+    IndexTy *workspace = workspace_owner.get();
+
+    using IdentityProjT = radix_sort_details::IdentityProj;
+    using IndexedProjT =
+        radix_sort_details::IndexedProj<IndexTy, argTy, IdentityProjT>;
+    const IndexedProjT proj_op{arg_tp};
+
+    using IotaKernelName = radix_argsort_iota_krn<argTy, IndexTy>;
+
+    using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
+
+    sycl::event iota_ev = iota_impl<IotaKernelName, IndexTy>(
+        exec_q, workspace, total_nelems, depends);
+
+    sycl::event radix_sort_ev =
+        radix_sort_details::parallel_radix_sort_impl<IndexTy, IndexedProjT>(
+            exec_q, iter_nelems, sort_nelems, workspace, res_tp, proj_op,
+            sort_ascending, {iota_ev});
+
+    using MapBackKernelName = radix_argsort_index_write_out_krn<argTy, IndexTy>;
+    using dpctl::tensor::kernels::sort_utils_detail::map_back_impl;
+
+    sycl::event dep = radix_sort_ev;
+
+    // no need to perform map_back ( id % sort_nelems)
+    //   if total_nelems == sort_nelems
+    if (iter_nelems > 1u) {
+        dep = map_back_impl<MapBackKernelName, IndexTy>(
+            exec_q, total_nelems, res_tp, res_tp, sort_nelems, {dep});
+    }
+
+    sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {dep}, workspace_owner);
+
+    return cleanup_ev;
+}
+
+} // namespace dpctl::tensor::kernels
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp
new file mode 100644
index 000000000000..1f3576402511
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp
@@ -0,0 +1,119 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor sort/argsort operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+
+namespace dpctl::tensor::kernels::search_sorted_detail
+{
+
+template <typename T>
+T quotient_ceil(T n, T m)
+{
+    return (n + m - 1) / m;
+}
+
+template <typename Acc, typename Value, typename Compare>
+std::size_t lower_bound_impl(const Acc acc,
+                             const std::size_t first,
+                             const std::size_t last,
+                             const Value &value,
+                             const Compare &comp)
+{
+    std::size_t n = last - first;
+    std::size_t cur = n, start = first;
+    std::size_t it;
+    while (n > 0) {
+        it = start;
+        cur = n / 2;
+        it += cur;
+        if (comp(acc[it], value)) {
+            n -= cur + 1, start = ++it;
+        }
+        else
+            n = cur;
+    }
+    return start;
+}
+
+template <typename Acc, typename Value, typename Compare>
+std::size_t upper_bound_impl(const Acc acc,
+                             const std::size_t first,
+                             const std::size_t last,
+                             const Value &value,
+                             const Compare &comp)
+{
+    const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); };
+    return lower_bound_impl(acc, first, last, value, op_comp);
+}
+
+template <typename Acc, typename Value, typename Compare, typename IndexerT>
+std::size_t lower_bound_indexed_impl(const Acc acc,
+                                     std::size_t first,
+                                     std::size_t last,
+                                     const Value &value,
+                                     const Compare &comp,
+                                     const IndexerT &acc_indexer)
+{
+    std::size_t n = last - first;
+    std::size_t cur = n, start = first;
+    std::size_t it;
+    while (n > 0) {
+        it = start;
+        cur = n / 2;
+        it += cur;
+        if (comp(acc[acc_indexer(it)], value)) {
+            n -= cur + 1, start = ++it;
+        }
+        else
+            n = cur;
+    }
+    return start;
+}
+
+template <typename Acc, typename Value, typename Compare, typename IndexerT>
+std::size_t upper_bound_indexed_impl(const Acc acc,
+                                     const std::size_t first,
+                                     const std::size_t last,
+                                     const Value &value,
+                                     const Compare &comp,
+                                     const IndexerT &acc_indexer)
+{
+    const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); };
+    return lower_bound_indexed_impl(acc, first, last, value, op_comp,
+                                    acc_indexer);
+}
+
+} // namespace dpctl::tensor::kernels::search_sorted_detail
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
new file mode 100644
index 000000000000..bc400c9e569a
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
@@ -0,0 +1,258 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor sort/argsort operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/sorting/search_sorted_detail.hpp"
+#include "utils/offset_utils.hpp"
+
+namespace dpctl::tensor::kernels
+{
+
+using dpctl::tensor::ssize_t;
+
+template <typename argTy,
+          typename indTy,
+          bool left_side,
+          typename HayIndexerT,
+          typename NeedlesIndexerT,
+          typename PositionsIndexerT,
+          typename Compare>
+struct SearchSortedFunctor
+{
+private:
+    const argTy *hay_tp;
+    const argTy *needles_tp;
+    indTy *positions_tp;
+    std::size_t hay_nelems;
+    HayIndexerT hay_indexer;
+    NeedlesIndexerT needles_indexer;
+    PositionsIndexerT positions_indexer;
+
+public:
+    SearchSortedFunctor(const argTy *hay_,
+                        const argTy *needles_,
+                        indTy *positions_,
+                        const std::size_t hay_nelems_,
+                        const HayIndexerT &hay_indexer_,
+                        const NeedlesIndexerT &needles_indexer_,
+                        const PositionsIndexerT &positions_indexer_)
+        : hay_tp(hay_), needles_tp(needles_), positions_tp(positions_),
+          hay_nelems(hay_nelems_), hay_indexer(hay_indexer_),
+          needles_indexer(needles_indexer_),
+          positions_indexer(positions_indexer_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        const Compare comp{};
+
+        const std::size_t i = id[0];
+        const argTy needle_v = needles_tp[needles_indexer(i)];
+
+        // position of the needle_v in the hay array
+        indTy pos{};
+
+        static constexpr std::size_t zero(0);
+        if constexpr (left_side) {
+            // search in hay in left-closed interval, give `pos` such that
+            // hay[pos - 1] < needle_v <= hay[pos]
+
+            // lower_bound returns the first pos such that bool(hay[pos] <
+            // needle_v) is false, i.e. needle_v <= hay[pos]
+            pos = search_sorted_detail::lower_bound_indexed_impl(
+                hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer);
+        }
+        else {
+            // search in hay in right-closed interval: hay[pos - 1] <= needle_v
+            // < hay[pos]
+
+            // upper_bound returns the first pos such that bool(needle_v <
+            // hay[pos]) is true, i.e. needle_v < hay[pos]
+            pos = search_sorted_detail::upper_bound_indexed_impl(
+                hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer);
+        }
+
+        positions_tp[positions_indexer(i)] = pos;
+    }
+};
+
+typedef sycl::event (*searchsorted_contig_impl_fp_ptr_t)(
+    sycl::queue &,
+    const std::size_t,
+    const std::size_t,
+    const char *,
+    const ssize_t,
+    const char *,
+    const ssize_t,
+    char *,
+    const ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T1, typename T2, bool left_closed>
+class searchsorted_contig_impl_krn;
+
+template <typename argTy, typename indTy, bool left_closed, typename Compare>
+sycl::event searchsorted_contig_impl(sycl::queue &exec_q,
+                                     const std::size_t hay_nelems,
+                                     const std::size_t needles_nelems,
+                                     const char *hay_cp,
+                                     const ssize_t hay_offset,
+                                     const char *needles_cp,
+                                     const ssize_t needles_offset,
+                                     char *positions_cp,
+                                     const ssize_t positions_offset,
+                                     const std::vector<sycl::event> &depends)
+{
+    const argTy *hay_tp = reinterpret_cast<const argTy *>(hay_cp) + hay_offset;
+    const argTy *needles_tp =
+        reinterpret_cast<const argTy *>(needles_cp) + needles_offset;
+
+    indTy *positions_tp =
+        reinterpret_cast<indTy *>(positions_cp) + positions_offset;
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using KernelName =
+            class searchsorted_contig_impl_krn<argTy, indTy, left_closed>;
+
+        sycl::range<1> gRange(needles_nelems);
+
+        using TrivialIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        static constexpr TrivialIndexerT hay_indexer{};
+        static constexpr TrivialIndexerT needles_indexer{};
+        static constexpr TrivialIndexerT positions_indexer{};
+
+        const auto fnctr =
+            SearchSortedFunctor<argTy, indTy, left_closed, TrivialIndexerT,
+                                TrivialIndexerT, TrivialIndexerT, Compare>(
+                hay_tp, needles_tp, positions_tp, hay_nelems, hay_indexer,
+                needles_indexer, positions_indexer);
+
+        cgh.parallel_for<KernelName>(gRange, fnctr);
+    });
+
+    return comp_ev;
+}
+
+typedef sycl::event (*searchsorted_strided_impl_fp_ptr_t)(
+    sycl::queue &,
+    const std::size_t,
+    const std::size_t,
+    const char *,
+    const ssize_t,
+    const ssize_t,
+    const char *,
+    const ssize_t,
+    char *,
+    const ssize_t,
+    int,
+    const ssize_t *,
+    const std::vector<sycl::event> &);
+
+template <typename T1, typename T2, bool left_closed>
+class searchsorted_strided_impl_krn;
+
+template <typename argTy, typename indTy, bool left_closed, typename Compare>
+sycl::event searchsorted_strided_impl(
+    sycl::queue &exec_q,
+    const std::size_t hay_nelems,
+    const std::size_t needles_nelems,
+    const char *hay_cp,
+    const ssize_t hay_offset,
+    // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array
+    const ssize_t hay_stride,
+    const char *needles_cp,
+    const ssize_t needles_offset,
+    char *positions_cp,
+    const ssize_t positions_offset,
+    const int needles_nd,
+    // packed_shape_strides is [needles_shape, needles_strides,
+    // positions_strides] has length of 3*needles_nd
+    const ssize_t *packed_shape_strides,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *hay_tp = reinterpret_cast<const argTy *>(hay_cp);
+    const argTy *needles_tp = reinterpret_cast<const argTy *>(needles_cp);
+
+    indTy *positions_tp = reinterpret_cast<indTy *>(positions_cp);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        sycl::range<1> gRange(needles_nelems);
+
+        using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        const HayIndexerT hay_indexer(
+            /* offset */ hay_offset,
+            /* size   */ hay_nelems,
+            /* step   */ hay_stride);
+
+        using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+        const ssize_t *needles_shape_strides = packed_shape_strides;
+        const NeedlesIndexerT needles_indexer(needles_nd, needles_offset,
+                                              needles_shape_strides);
+        using PositionsIndexerT =
+            dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+        const ssize_t *positions_shape = packed_shape_strides;
+        const ssize_t *positions_strides =
+            packed_shape_strides + 2 * needles_nd;
+        const PositionsIndexerT positions_indexer(
+            needles_nd, positions_offset, positions_shape, positions_strides);
+
+        const auto fnctr =
+            SearchSortedFunctor<argTy, indTy, left_closed, HayIndexerT,
+                                NeedlesIndexerT, PositionsIndexerT, Compare>(
+                hay_tp, needles_tp, positions_tp, hay_nelems, hay_indexer,
+                needles_indexer, positions_indexer);
+        using KernelName =
+            class searchsorted_strided_impl_krn<argTy, indTy, left_closed>;
+
+        cgh.parallel_for<KernelName>(gRange, fnctr);
+    });
+
+    return comp_ev;
+}
+
+} // namespace dpctl::tensor::kernels
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp
new file mode 100644
index 000000000000..7b48f310a445
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp
@@ -0,0 +1,61 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+
+namespace dpctl::tensor::kernels
+{
+
+using dpctl::tensor::ssize_t;
+
+typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &,
+                                            std::size_t,
+                                            std::size_t,
+                                            const char *,
+                                            char *,
+                                            ssize_t,
+                                            ssize_t,
+                                            ssize_t,
+                                            ssize_t,
+                                            const std::vector<sycl::event> &);
+
+} // namespace dpctl::tensor::kernels
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
new file mode 100644
index 000000000000..fd32905b808e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
@@ -0,0 +1,144 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor sort/argsort operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::kernels::sort_utils_detail
+{
+
+namespace syclexp = sycl::ext::oneapi::experimental;
+
+template <class KernelName, typename T>
+sycl::event iota_impl(sycl::queue &exec_q,
+                      T *data,
+                      std::size_t nelems,
+                      const std::vector<sycl::event> &dependent_events)
+{
+    static constexpr std::uint32_t lws = 256;
+    static constexpr std::uint32_t n_wi = 4;
+    const std::size_t n_groups = (nelems + n_wi * lws - 1) / (n_wi * lws);
+
+    sycl::range<1> gRange{n_groups * lws};
+    sycl::range<1> lRange{lws};
+    sycl::nd_range<1> ndRange{gRange, lRange};
+
+    sycl::event e = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependent_events);
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> it) {
+            const std::size_t gid = it.get_global_linear_id();
+            const auto &sg = it.get_sub_group();
+            const std::uint32_t lane_id = sg.get_local_id()[0];
+
+            const std::size_t offset = (gid - lane_id) * n_wi;
+            const std::uint32_t max_sgSize = sg.get_max_local_range()[0];
+
+            std::array<T, n_wi> stripe{};
+#pragma unroll
+            for (std::uint32_t i = 0; i < n_wi; ++i) {
+                stripe[i] = T(offset + lane_id + i * max_sgSize);
+            }
+
+            if (offset + n_wi * max_sgSize < nelems) {
+                static constexpr auto group_ls_props =
+                    syclexp::properties{syclexp::data_placement_striped};
+
+                auto out_multi_ptr = sycl::address_space_cast<
+                    sycl::access::address_space::global_space,
+                    sycl::access::decorated::yes>(&data[offset]);
+
+                syclexp::group_store(sg, sycl::span<T, n_wi>{&stripe[0], n_wi},
+                                     out_multi_ptr, group_ls_props);
+            }
+            else {
+                for (std::size_t idx = offset + lane_id; idx < nelems;
+                     idx += max_sgSize) {
+                    data[idx] = T(idx);
+                }
+            }
+        });
+    });
+
+    return e;
+}
+
+template <class KernelName, typename IndexTy>
+sycl::event map_back_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          const IndexTy *flat_index_data,
+                          IndexTy *reduced_index_data,
+                          std::size_t row_size,
+                          const std::vector<sycl::event> &dependent_events)
+{
+    static constexpr std::uint32_t lws = 64;
+    static constexpr std::uint32_t n_wi = 4;
+    const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws);
+
+    sycl::range<1> lRange{lws};
+    sycl::range<1> gRange{n_groups * lws};
+    sycl::nd_range<1> ndRange{gRange, lRange};
+
+    sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependent_events);
+
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> it) {
+            const std::size_t gid = it.get_global_linear_id();
+            const auto &sg = it.get_sub_group();
+            const std::uint32_t lane_id = sg.get_local_id()[0];
+            const std::uint32_t sg_size = sg.get_max_local_range()[0];
+
+            const std::size_t start_id = (gid - lane_id) * n_wi + lane_id;
+
+#pragma unroll
+            for (std::uint32_t i = 0; i < n_wi; ++i) {
+                const std::size_t data_id = start_id + i * sg_size;
+
+                if (data_id < nelems) {
+                    const IndexTy linear_index = flat_index_data[data_id];
+                    reduced_index_data[data_id] = (linear_index % row_size);
+                }
+            }
+        });
+    });
+
+    return map_back_ev;
+}
+
+} // namespace dpctl::tensor::kernels::sort_utils_detail
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp
new file mode 100644
index 000000000000..d9a103a02e99
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp
@@ -0,0 +1,512 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor topk operation.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <stdexcept>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/sorting/merge_sort.hpp"
+#include "kernels/sorting/radix_sort.hpp"
+#include "kernels/sorting/search_sorted_detail.hpp"
+#include "kernels/sorting/sort_utils.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+
+namespace dpctl::tensor::kernels
+{
+
+namespace topk_detail
+{
+
+void scale_topk_params(const std::uint64_t nelems_per_slm,
+                       const std::size_t sub_groups_per_work_group,
+                       const std::uint32_t elems_per_wi,
+                       const std::vector<std::size_t> &sg_sizes,
+                       std::size_t &lws,
+                       std::size_t &nelems_wg_sorts)
+{
+    for (auto it = sg_sizes.rbegin(); it != sg_sizes.rend(); ++it) {
+        auto sg_size = *it;
+        lws = sub_groups_per_work_group * sg_size;
+        nelems_wg_sorts = elems_per_wi * lws;
+        if (nelems_wg_sorts < nelems_per_slm) {
+            return;
+        }
+    }
+    // should never reach
+    throw std::runtime_error("Could not construct top k kernel parameters");
+}
+
+template <class KernelName, typename argTy, typename IndexTy>
+sycl::event write_out_impl(sycl::queue &exec_q,
+                           std::size_t iter_nelems,
+                           std::size_t k,
+                           const argTy *arg_tp,
+                           const IndexTy *index_data,
+                           std::size_t iter_index_stride,
+                           std::size_t axis_nelems,
+                           argTy *vals_tp,
+                           IndexTy *inds_tp,
+                           const std::vector<sycl::event> &depends)
+{
+    static constexpr std::uint32_t lws = 64;
+    static constexpr std::uint32_t n_wi = 4;
+    const std::size_t nelems = iter_nelems * k;
+    const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws);
+
+    sycl::range<1> lRange{lws};
+    sycl::range<1> gRange{n_groups * lws};
+    sycl::nd_range<1> ndRange{gRange, lRange};
+
+    sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> it) {
+            const std::size_t gid = it.get_global_linear_id();
+            const auto &sg = it.get_sub_group();
+            const std::uint32_t lane_id = sg.get_local_id()[0];
+            const std::uint32_t sg_size = sg.get_max_local_range()[0];
+
+            const std::size_t start_id = (gid - lane_id) * n_wi + lane_id;
+
+#pragma unroll
+            for (std::uint32_t i = 0; i < n_wi; ++i) {
+                const std::size_t data_id = start_id + i * sg_size;
+
+                if (data_id < nelems) {
+                    const std::size_t iter_id = data_id / k;
+
+                    /*
+                    const std::size_t axis_gid = data_id - (iter_gid * k);
+                    const std::size_t src_idx = iter_gid * iter_index_stride +
+                    axis_gid;
+                    */
+                    const std::size_t src_idx =
+                        data_id + iter_id * (iter_index_stride - k);
+
+                    const IndexTy res_ind = index_data[src_idx];
+                    const argTy v = arg_tp[res_ind];
+
+                    const std::size_t dst_idx = data_id;
+                    vals_tp[dst_idx] = v;
+                    inds_tp[dst_idx] = (res_ind % axis_nelems);
+                }
+            }
+        });
+    });
+
+    return write_out_ev;
+}
+
+} // namespace topk_detail
+
+template <typename T1, typename T2>
+class topk_populate_index_data_krn;
+
+template <typename T1, typename T2>
+class topk_full_merge_map_back_krn;
+
+template <typename argTy, typename IndexTy, typename CompT>
+sycl::event
+    topk_full_merge_sort_impl(sycl::queue &exec_q,
+                              std::size_t iter_nelems, // number of sub-arrays
+                              std::size_t axis_nelems, // size of each sub-array
+                              std::size_t k,
+                              const argTy *arg_tp,
+                              argTy *vals_tp,
+                              IndexTy *inds_tp,
+                              const CompT &comp,
+                              const std::vector<sycl::event> &depends)
+{
+    auto index_data_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
+            iter_nelems * axis_nelems, exec_q);
+    // extract USM pointer
+    IndexTy *index_data = index_data_owner.get();
+
+    using IotaKernelName = topk_populate_index_data_krn<argTy, IndexTy>;
+
+    using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
+
+    sycl::event populate_indexed_data_ev = iota_impl<IotaKernelName, IndexTy>(
+        exec_q, index_data, iter_nelems * axis_nelems, depends);
+
+    std::size_t sorted_block_size;
+    // Sort segments of the array
+    sycl::event base_sort_ev =
+        merge_sort_detail::sort_over_work_group_contig_impl(
+            exec_q, iter_nelems, axis_nelems, index_data, index_data, comp,
+            sorted_block_size, // modified in place with size of sorted block
+                               // size
+            {populate_indexed_data_ev});
+
+    // Merge segments in parallel until all elements are sorted
+    sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl(
+        exec_q, iter_nelems, axis_nelems, index_data, comp, sorted_block_size,
+        {base_sort_ev});
+
+    using WriteOutKernelName = topk_full_merge_map_back_krn<argTy, IndexTy>;
+
+    sycl::event write_out_ev =
+        topk_detail::write_out_impl<WriteOutKernelName, argTy, IndexTy>(
+            exec_q, iter_nelems, k, arg_tp, index_data, axis_nelems,
+            axis_nelems, vals_tp, inds_tp, {merges_ev});
+
+    sycl::event cleanup_host_task_event =
+        dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev},
+                                                     index_data_owner);
+
+    return cleanup_host_task_event;
+};
+
+template <typename T1, typename T2>
+class topk_partial_merge_map_back_krn;
+
+template <typename T1, typename T2, typename Comp>
+class topk_over_work_group_krn;
+
+template <typename argTy,
+          typename IndexTy,
+          typename ValueComp = std::less<argTy>>
+sycl::event topk_merge_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows
+                             // in a matrix when sorting over rows)
+    std::size_t axis_nelems, // size of each array to sort  (length of
+                             // rows, i.e. number of columns)
+    std::size_t k,
+    const char *arg_cp,
+    char *vals_cp,
+    char *inds_cp,
+    const std::vector<sycl::event> &depends)
+{
+    if (axis_nelems < k) {
+        throw std::runtime_error("Invalid sort axis size for value of k");
+    }
+
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
+    argTy *vals_tp = reinterpret_cast<argTy *>(vals_cp);
+    IndexTy *inds_tp = reinterpret_cast<IndexTy *>(inds_cp);
+
+    using dpctl::tensor::kernels::IndexComp;
+    const IndexComp<IndexTy, argTy, ValueComp> index_comp{arg_tp, ValueComp{}};
+
+    if (axis_nelems <= 512 || k >= 1024 || k > axis_nelems / 2) {
+        return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, k,
+                                         arg_tp, vals_tp, inds_tp, index_comp,
+                                         depends);
+    }
+    else {
+        using PartialKernelName =
+            topk_over_work_group_krn<IndexTy, IndexTy, ValueComp>;
+
+        const auto &kernel_id = sycl::get_kernel_id<PartialKernelName>();
+
+        auto const &ctx = exec_q.get_context();
+        auto const &dev = exec_q.get_device();
+
+        auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+            ctx, {dev}, {kernel_id});
+
+        auto krn = kb.get_kernel(kernel_id);
+
+        const std::uint32_t max_sg_size = krn.template get_info<
+            sycl::info::kernel_device_specific::max_sub_group_size>(dev);
+        const std::uint64_t device_local_memory_size =
+            dev.get_info<sycl::info::device::local_mem_size>();
+
+        //  leave 512 bytes of local memory for RT
+        const std::uint64_t safety_margin = 512;
+
+        const std::uint64_t nelems_per_slm =
+            (device_local_memory_size - safety_margin) / (2 * sizeof(IndexTy));
+
+        static constexpr std::uint32_t sub_groups_per_work_group = 4;
+        const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2;
+
+        std::size_t lws = sub_groups_per_work_group * max_sg_size;
+
+        std::size_t sorted_block_size = elems_per_wi * lws;
+        if (sorted_block_size > nelems_per_slm) {
+            const std::vector<std::size_t> sg_sizes =
+                dev.get_info<sycl::info::device::sub_group_sizes>();
+            topk_detail::scale_topk_params(
+                nelems_per_slm, sub_groups_per_work_group, elems_per_wi,
+                sg_sizes,
+                lws,              // modified by reference
+                sorted_block_size // modified by reference
+            );
+        }
+
+        // This assumption permits doing away with using a loop
+        assert(sorted_block_size % lws == 0);
+
+        using search_sorted_detail::quotient_ceil;
+        const std::size_t n_segments =
+            quotient_ceil<std::size_t>(axis_nelems, sorted_block_size);
+
+        // round k up for the later merge kernel if necessary
+        const std::size_t round_k_to = dev.has(sycl::aspect::cpu) ? 32 : 4;
+        std::size_t k_rounded =
+            (k < round_k_to)
+                ? k
+                : quotient_ceil<std::size_t>(k, round_k_to) * round_k_to;
+
+        // get length of tail for alloc size
+        auto rem = axis_nelems % sorted_block_size;
+        auto alloc_len = (rem && rem < k_rounded)
+                             ? rem + k_rounded * (n_segments - 1)
+                             : k_rounded * n_segments;
+
+        // if allocation would be sufficiently large or k is larger than
+        // elements processed, use full sort
+        if (k_rounded >= axis_nelems || k_rounded >= sorted_block_size ||
+            alloc_len >= axis_nelems / 2)
+        {
+            return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems,
+                                             k, arg_tp, vals_tp, inds_tp,
+                                             index_comp, depends);
+        }
+
+        auto index_data_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
+                iter_nelems * alloc_len, exec_q);
+        // get raw USM pointer
+        IndexTy *index_data = index_data_owner.get();
+
+        // no need to populate index data: SLM will be populated with default
+        // values
+
+        sycl::event base_sort_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            cgh.use_kernel_bundle(kb);
+
+            sycl::range<1> global_range{iter_nelems * n_segments * lws};
+            sycl::range<1> local_range{lws};
+
+            sycl::range<1> slm_range{sorted_block_size};
+            sycl::local_accessor<IndexTy, 1> work_space(slm_range, cgh);
+            sycl::local_accessor<IndexTy, 1> scratch_space(slm_range, cgh);
+
+            sycl::nd_range<1> ndRange(global_range, local_range);
+
+            cgh.parallel_for<PartialKernelName>(
+                ndRange, [=](sycl::nd_item<1> it) {
+                    const std::size_t group_id = it.get_group_linear_id();
+                    const std::size_t iter_id = group_id / n_segments;
+                    const std::size_t segment_id =
+                        group_id - iter_id * n_segments;
+                    const std::size_t lid = it.get_local_linear_id();
+
+                    const std::size_t segment_start_idx =
+                        segment_id * sorted_block_size;
+                    const std::size_t segment_end_idx = std::min<std::size_t>(
+                        segment_start_idx + sorted_block_size, axis_nelems);
+                    const std::size_t wg_chunk_size =
+                        segment_end_idx - segment_start_idx;
+
+                    // load input into SLM
+                    for (std::size_t array_id = segment_start_idx + lid;
+                         array_id < segment_end_idx; array_id += lws)
+                    {
+                        IndexTy v = (array_id < axis_nelems)
+                                        ? iter_id * axis_nelems + array_id
+                                        : IndexTy{};
+                        work_space[array_id - segment_start_idx] = v;
+                    }
+                    sycl::group_barrier(it.get_group());
+
+                    const std::size_t chunk =
+                        quotient_ceil<std::size_t>(sorted_block_size, lws);
+
+                    const std::size_t chunk_start_idx = lid * chunk;
+                    const std::size_t chunk_end_idx =
+                        sycl::min(chunk_start_idx + chunk, wg_chunk_size);
+
+                    merge_sort_detail::leaf_sort_impl(
+                        work_space, chunk_start_idx, chunk_end_idx, index_comp);
+
+                    sycl::group_barrier(it.get_group());
+
+                    bool data_in_temp = false;
+                    std::size_t n_chunks_merged = 1;
+
+                    // merge chunk while n_chunks_merged * chunk < wg_chunk_size
+                    const std::size_t max_chunks_merged =
+                        1 + ((wg_chunk_size - 1) / chunk);
+                    for (; n_chunks_merged < max_chunks_merged;
+                         data_in_temp = !data_in_temp, n_chunks_merged *= 2)
+                    {
+                        const std::size_t nelems_sorted_so_far =
+                            n_chunks_merged * chunk;
+                        const std::size_t q = (lid / n_chunks_merged);
+                        const std::size_t start_1 = sycl::min(
+                            2 * nelems_sorted_so_far * q, wg_chunk_size);
+                        const std::size_t end_1 = sycl::min(
+                            start_1 + nelems_sorted_so_far, wg_chunk_size);
+                        const std::size_t end_2 = sycl::min(
+                            end_1 + nelems_sorted_so_far, wg_chunk_size);
+                        const std::size_t offset =
+                            chunk * (lid - q * n_chunks_merged);
+
+                        if (data_in_temp) {
+                            merge_sort_detail::merge_impl(
+                                offset, scratch_space, work_space, start_1,
+                                end_1, end_2, start_1, index_comp, chunk);
+                        }
+                        else {
+                            merge_sort_detail::merge_impl(
+                                offset, work_space, scratch_space, start_1,
+                                end_1, end_2, start_1, index_comp, chunk);
+                        }
+                        sycl::group_barrier(it.get_group());
+                    }
+
+                    // output assumed to be structured as (iter_nelems,
+                    // alloc_len)
+                    const std::size_t k_segment_start_idx =
+                        segment_id * k_rounded;
+                    const std::size_t k_segment_end_idx = std::min<std::size_t>(
+                        k_segment_start_idx + k_rounded, alloc_len);
+                    const auto &out_src =
+                        (data_in_temp) ? scratch_space : work_space;
+                    for (std::size_t array_id = k_segment_start_idx + lid;
+                         array_id < k_segment_end_idx; array_id += lws)
+                    {
+                        if (lid < k_rounded) {
+                            index_data[iter_id * alloc_len + array_id] =
+                                out_src[array_id - k_segment_start_idx];
+                        }
+                    }
+                });
+        });
+
+        // Merge segments in parallel until all elements are sorted
+        sycl::event merges_ev =
+            merge_sort_detail::merge_sorted_block_contig_impl(
+                exec_q, iter_nelems, alloc_len, index_data, index_comp,
+                k_rounded, {base_sort_ev});
+
+        // Write out top k of the merge-sorted memory
+        using WriteOutKernelName =
+            topk_partial_merge_map_back_krn<argTy, IndexTy>;
+
+        sycl::event write_topk_ev =
+            topk_detail::write_out_impl<WriteOutKernelName, argTy, IndexTy>(
+                exec_q, iter_nelems, k, arg_tp, index_data, alloc_len,
+                axis_nelems, vals_tp, inds_tp, {merges_ev});
+
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {write_topk_ev}, index_data_owner);
+
+        return cleanup_host_task_event;
+    }
+}
+
+template <typename T1, typename T2>
+class topk_iota_krn;
+
+template <typename T1, typename T2>
+class topk_radix_map_back_krn;
+
+template <typename argTy, typename IndexTy>
+sycl::event topk_radix_impl(sycl::queue &exec_q,
+                            std::size_t iter_nelems, // number of sub-arrays
+                            std::size_t axis_nelems, // size of each sub-array
+                            std::size_t k,
+                            bool ascending,
+                            const char *arg_cp,
+                            char *vals_cp,
+                            char *inds_cp,
+                            const std::vector<sycl::event> &depends)
+{
+    if (axis_nelems < k) {
+        throw std::runtime_error("Invalid sort axis size for value of k");
+    }
+
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
+    argTy *vals_tp = reinterpret_cast<argTy *>(vals_cp);
+    IndexTy *inds_tp = reinterpret_cast<IndexTy *>(inds_cp);
+
+    const std::size_t total_nelems = iter_nelems * axis_nelems;
+    const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64;
+    auto workspace_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
+            padded_total_nelems + total_nelems, exec_q);
+
+    // get raw USM pointer
+    IndexTy *workspace = workspace_owner.get();
+    IndexTy *tmp_tp = workspace + padded_total_nelems;
+
+    using IdentityProjT = radix_sort_details::IdentityProj;
+    using IndexedProjT =
+        radix_sort_details::IndexedProj<IndexTy, argTy, IdentityProjT>;
+    const IndexedProjT proj_op{arg_tp};
+
+    using IotaKernelName = topk_iota_krn<argTy, IndexTy>;
+
+    using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
+
+    sycl::event iota_ev = iota_impl<IotaKernelName, IndexTy>(
+        exec_q, workspace, total_nelems, depends);
+
+    sycl::event radix_sort_ev =
+        radix_sort_details::parallel_radix_sort_impl<IndexTy, IndexedProjT>(
+            exec_q, iter_nelems, axis_nelems, workspace, tmp_tp, proj_op,
+            ascending, {iota_ev});
+
+    // Write out top k of the temporary
+    using WriteOutKernelName = topk_radix_map_back_krn<argTy, IndexTy>;
+
+    sycl::event write_topk_ev =
+        topk_detail::write_out_impl<WriteOutKernelName, argTy, IndexTy>(
+            exec_q, iter_nelems, k, arg_tp, tmp_tp, axis_nelems, axis_nelems,
+            vals_tp, inds_tp, {radix_sort_ev});
+
+    sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {write_topk_ev}, workspace_owner);
+
+    return cleanup_ev;
+}
+
+} // namespace dpctl::tensor::kernels
diff --git a/dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp b/dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp
new file mode 100644
index 000000000000..87cdfbfbd54f
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp
@@ -0,0 +1,149 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cmath>
+#include <complex>
+#include <functional>
+#include <type_traits>
+
+#include "sycl/sycl.hpp"
+
+namespace dpctl::tensor::rich_comparisons
+{
+
+namespace detail
+{
+template <typename fpT>
+struct ExtendedRealFPLess
+{
+    /* [R, nan] */
+    bool operator()(const fpT v1, const fpT v2) const
+    {
+        return (!std::isnan(v1) && (std::isnan(v2) || (v1 < v2)));
+    }
+};
+
+template <typename fpT>
+struct ExtendedRealFPGreater
+{
+    bool operator()(const fpT v1, const fpT v2) const
+    {
+        return (!std::isnan(v2) && (std::isnan(v1) || (v2 < v1)));
+    }
+};
+
+template <typename cT>
+struct ExtendedComplexFPLess
+{
+    /* [(R, R), (R, nan), (nan, R), (nan, nan)] */
+
+    bool operator()(const cT &v1, const cT &v2) const
+    {
+        using realT = typename cT::value_type;
+
+        const realT real1 = std::real(v1);
+        const realT real2 = std::real(v2);
+
+        const bool r1_nan = std::isnan(real1);
+        const bool r2_nan = std::isnan(real2);
+
+        const realT imag1 = std::imag(v1);
+        const realT imag2 = std::imag(v2);
+
+        const bool i1_nan = std::isnan(imag1);
+        const bool i2_nan = std::isnan(imag2);
+
+        const int idx1 = ((r1_nan) ? 2 : 0) + ((i1_nan) ? 1 : 0);
+        const int idx2 = ((r2_nan) ? 2 : 0) + ((i2_nan) ? 1 : 0);
+
+        const bool res =
+            !(r1_nan && i1_nan) &&
+            ((idx1 < idx2) ||
+             ((idx1 == idx2) &&
+              ((r1_nan && !i1_nan && (imag1 < imag2)) ||
+               (!r1_nan && i1_nan && (real1 < real2)) ||
+               (!r1_nan && !i1_nan &&
+                ((real1 < real2) || (!(real2 < real1) && (imag1 < imag2)))))));
+
+        return res;
+    }
+};
+
+template <typename cT>
+struct ExtendedComplexFPGreater
+{
+    bool operator()(const cT &v1, const cT &v2) const
+    {
+        auto less_ = ExtendedComplexFPLess<cT>{};
+        return less_(v2, v1);
+    }
+};
+
+template <typename T>
+inline constexpr bool is_fp_v = (std::is_same_v<T, sycl::half> ||
+                                 std::is_same_v<T, float> ||
+                                 std::is_same_v<T, double>);
+
+} // namespace detail
+
+template <typename argTy>
+struct AscendingSorter
+{
+    using type = std::conditional_t<detail::is_fp_v<argTy>,
+                                    detail::ExtendedRealFPLess<argTy>,
+                                    std::less<argTy>>;
+};
+
+template <typename T>
+struct AscendingSorter<std::complex<T>>
+{
+    using type = detail::ExtendedComplexFPLess<std::complex<T>>;
+};
+
+template <typename argTy>
+struct DescendingSorter
+{
+    using type = std::conditional_t<detail::is_fp_v<argTy>,
+                                    detail::ExtendedRealFPGreater<argTy>,
+                                    std::greater<argTy>>;
+};
+
+template <typename T>
+struct DescendingSorter<std::complex<T>>
+{
+    using type = detail::ExtendedComplexFPGreater<std::complex<T>>;
+};
+
+} // namespace dpctl::tensor::rich_comparisons
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp
new file mode 100644
index 000000000000..f1ae5863bbb9
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp
@@ -0,0 +1,325 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <stdexcept>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/sorting/isin.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "simplify_iteration_space.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+namespace detail
+{
+
+using dpctl::tensor::kernels::isin_contig_impl_fp_ptr_t;
+
+static isin_contig_impl_fp_ptr_t
+    isin_contig_impl_dispatch_vector[td_ns::num_types];
+
+template <typename fnT, typename argTy>
+struct IsinContigFactory
+{
+    constexpr IsinContigFactory() {}
+
+    fnT get() const
+    {
+        using dpctl::tensor::kernels::isin_contig_impl;
+        return isin_contig_impl<argTy>;
+    }
+};
+
+using dpctl::tensor::kernels::isin_strided_impl_fp_ptr_t;
+
+static isin_strided_impl_fp_ptr_t
+    isin_strided_impl_dispatch_vector[td_ns::num_types];
+
+template <typename fnT, typename argTy>
+struct IsinStridedFactory
+{
+    constexpr IsinStridedFactory() {}
+
+    fnT get() const
+    {
+        using dpctl::tensor::kernels::isin_strided_impl;
+        return isin_strided_impl<argTy>;
+    }
+};
+
+void init_isin_dispatch_vector(void)
+{
+
+    // Contiguous input function dispatch
+    td_ns::DispatchVectorBuilder<isin_contig_impl_fp_ptr_t, IsinContigFactory,
+                                 td_ns::num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isin_contig_impl_dispatch_vector);
+
+    // Strided input function dispatch
+    td_ns::DispatchVectorBuilder<isin_strided_impl_fp_ptr_t, IsinStridedFactory,
+                                 td_ns::num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isin_strided_impl_dispatch_vector);
+}
+
+} // namespace detail
+
+/*! @brief search for needle from needles in sorted hay */
+std::pair<sycl::event, sycl::event>
+    py_isin(const dpctl::tensor::usm_ndarray &needles,
+            const dpctl::tensor::usm_ndarray &hay,
+            const dpctl::tensor::usm_ndarray &dst,
+            sycl::queue &exec_q,
+            const bool invert,
+            const std::vector<sycl::event> &depends)
+{
+    const int hay_nd = hay.get_ndim();
+    const int needles_nd = needles.get_ndim();
+    const int dst_nd = dst.get_ndim();
+
+    if (hay_nd != 1 || needles_nd != dst_nd) {
+        throw py::value_error("Array dimensions mismatch");
+    }
+
+    // check that needle and dst have the same shape
+    std::size_t needles_nelems(1);
+    bool same_shape(true);
+
+    const std::size_t hay_nelems = static_cast<std::size_t>(hay.get_shape(0));
+
+    const py::ssize_t *needles_shape_ptr = needles.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    for (int i = 0; (i < needles_nd) && same_shape; ++i) {
+        const auto needles_sh_i = needles_shape_ptr[i];
+        const auto dst_sh_i = dst_shape_ptr[i];
+
+        same_shape = same_shape && (needles_sh_i == dst_sh_i);
+        needles_nelems *= static_cast<std::size_t>(needles_sh_i);
+    }
+
+    if (!same_shape) {
+        throw py::value_error(
+            "Array of values to search for and array of their "
+            "dst do not have the same shape");
+    }
+
+    // check that dst is ample enough
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst,
+                                                               needles_nelems);
+
+    // check that dst is writable
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(exec_q, {hay, needles, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    // if output array overlaps with input arrays, race condition results
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(dst, hay) || overlap(dst, needles)) {
+        throw py::value_error("Destination array overlaps with input.");
+    }
+
+    const int hay_typenum = hay.get_typenum();
+    const int needles_typenum = needles.get_typenum();
+    const int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    const int hay_typeid = array_types.typenum_to_lookup_id(hay_typenum);
+    const int needles_typeid =
+        array_types.typenum_to_lookup_id(needles_typenum);
+    const int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    // check hay and needle have the same data-type
+    if (needles_typeid != hay_typeid) {
+        throw py::value_error(
+            "Hay array and needles array must have the same data types");
+    }
+    // check that dst has boolean data type
+    const auto dst_typenum_t_v = static_cast<td_ns::typenum_t>(dst_typeid);
+    if (dst_typenum_t_v != td_ns::typenum_t::BOOL) {
+        throw py::value_error("dst array must have data-type bool");
+    }
+
+    if (needles_nelems == 0) {
+        // Nothing to do
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    // if all inputs are contiguous call contiguous implementations
+    // otherwise call strided implementation
+    const bool hay_is_c_contig = hay.is_c_contiguous();
+    const bool hay_is_f_contig = hay.is_f_contiguous();
+
+    const bool needles_is_c_contig = needles.is_c_contiguous();
+    const bool needles_is_f_contig = needles.is_f_contiguous();
+
+    const bool dst_is_c_contig = dst.is_c_contiguous();
+    const bool dst_is_f_contig = dst.is_f_contiguous();
+
+    const bool all_c_contig =
+        (hay_is_c_contig && needles_is_c_contig && dst_is_c_contig);
+    const bool all_f_contig =
+        (hay_is_f_contig && needles_is_f_contig && dst_is_f_contig);
+
+    const char *hay_data = hay.get_data();
+    const char *needles_data = needles.get_data();
+
+    char *dst_data = dst.get_data();
+
+    if (all_c_contig || all_f_contig) {
+        auto fn = detail::isin_contig_impl_dispatch_vector[hay_typeid];
+
+        static constexpr py::ssize_t zero_offset(0);
+
+        sycl::event comp_ev = fn(exec_q, invert, hay_nelems, needles_nelems,
+                                 hay_data, zero_offset, needles_data,
+                                 zero_offset, dst_data, zero_offset, depends);
+
+        return std::make_pair(dpctl::utils::keep_args_alive(
+                                  exec_q, {hay, needles, dst}, {comp_ev}),
+                              comp_ev);
+    }
+
+    // strided case
+
+    const auto &needles_strides = needles.get_strides_vector();
+    const auto &dst_strides = dst.get_strides_vector();
+
+    int simplified_nd = needles_nd;
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_common_shape;
+    shT simplified_needles_strides;
+    shT simplified_dst_strides;
+    py::ssize_t needles_offset(0);
+    py::ssize_t dst_offset(0);
+
+    if (simplified_nd == 0) {
+        // needles and dst have same nd
+        simplified_nd = 1;
+        simplified_common_shape.push_back(1);
+        simplified_needles_strides.push_back(0);
+        simplified_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            // modified by reference
+            simplified_nd,
+            // read-only inputs
+            needles_shape_ptr, needles_strides, dst_strides,
+            // output, modified by reference
+            simplified_common_shape, simplified_needles_strides,
+            simplified_dst_strides, needles_offset, dst_offset);
+    }
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events,
+        // vectors being packed
+        simplified_common_shape, simplified_needles_strides,
+        simplified_dst_strides);
+    auto packed_shape_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    const sycl::event &copy_shape_strides_ev =
+        std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shape_strides_ev);
+
+    auto strided_fn = detail::isin_strided_impl_dispatch_vector[hay_typeid];
+
+    if (!strided_fn) {
+        throw std::runtime_error(
+            "No implementation for data types of input arrays");
+    }
+
+    static constexpr py::ssize_t zero_offset(0);
+    py::ssize_t hay_step = hay.get_strides_vector()[0];
+
+    const sycl::event &comp_ev = strided_fn(
+        exec_q, invert, hay_nelems, needles_nelems, hay_data, zero_offset,
+        hay_step, needles_data, needles_offset, dst_data, dst_offset,
+        simplified_nd, packed_shape_strides, all_deps);
+
+    // free packed temporaries
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {comp_ev}, packed_shape_strides_owner);
+
+    host_task_events.push_back(temporaries_cleanup_ev);
+    const sycl::event &ht_ev = dpctl::utils::keep_args_alive(
+        exec_q, {hay, needles, dst}, host_task_events);
+
+    return std::make_pair(ht_ev, comp_ev);
+}
+
+void init_isin_functions(py::module_ m)
+{
+    detail::init_isin_dispatch_vector();
+
+    m.def("_isin", &py_isin, py::arg("needles"), py::arg("hay"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("invert"),
+          py::arg("depends") = py::list());
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/isin.hpp b/dpctl_ext/tensor/libtensor/source/sorting/isin.hpp
new file mode 100644
index 000000000000..236e8b5898c6
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/isin.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_isin_functions(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp
new file mode 100644
index 000000000000..2b6dcc8bf447
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp
@@ -0,0 +1,157 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "utils/rich_comparisons.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/sorting/merge_sort.hpp"
+#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
+
+#include "merge_argsort.hpp"
+#include "py_argsort_common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+static sort_contig_fn_ptr_t
+    ascending_argsort_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static sort_contig_fn_ptr_t
+    descending_argsort_contig_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct AscendingArgSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<IndexTy, std::int64_t> ||
+                      std::is_same_v<IndexTy, std::int32_t>)
+        {
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+            using Comp = typename AscendingSorter<argTy>::type;
+
+            using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl;
+            return stable_argsort_axis1_contig_impl<argTy, IndexTy, Comp>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct DescendingArgSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<IndexTy, std::int64_t> ||
+                      std::is_same_v<IndexTy, std::int32_t>)
+        {
+            using dpctl::tensor::rich_comparisons::DescendingSorter;
+            using Comp = typename DescendingSorter<argTy>::type;
+
+            using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl;
+            return stable_argsort_axis1_contig_impl<argTy, IndexTy, Comp>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_merge_argsort_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+    td_ns::DispatchTableBuilder<sort_contig_fn_ptr_t,
+                                AscendingArgSortContigFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(ascending_argsort_contig_dispatch_table);
+
+    td_ns::DispatchTableBuilder<
+        sort_contig_fn_ptr_t, DescendingArgSortContigFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(descending_argsort_contig_dispatch_table);
+}
+
+void init_merge_argsort_functions(py::module_ m)
+{
+    init_merge_argsort_dispatch_tables();
+
+    auto py_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src,
+                                   const int trailing_dims_to_sort,
+                                   const dpctl::tensor::usm_ndarray &dst,
+                                   sycl::queue &exec_q,
+                                   const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                          ascending_argsort_contig_dispatch_table);
+    };
+    m.def("_argsort_ascending", py_argsort_ascending, py::arg("src"),
+          py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto py_argsort_descending = [](const dpctl::tensor::usm_ndarray &src,
+                                    const int trailing_dims_to_sort,
+                                    const dpctl::tensor::usm_ndarray &dst,
+                                    sycl::queue &exec_q,
+                                    const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                          descending_argsort_contig_dispatch_table);
+    };
+    m.def("_argsort_descending", py_argsort_descending, py::arg("src"),
+          py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    return;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp
new file mode 100644
index 000000000000..10777b4bc2fd
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_merge_argsort_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp
new file mode 100644
index 000000000000..fbd60621b3bb
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp
@@ -0,0 +1,139 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "utils/rich_comparisons.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/sorting/merge_sort.hpp"
+#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
+
+#include "merge_sort.hpp"
+#include "py_sort_common.hpp"
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+static sort_contig_fn_ptr_t
+    ascending_sort_contig_dispatch_vector[td_ns::num_types];
+static sort_contig_fn_ptr_t
+    descending_sort_contig_dispatch_vector[td_ns::num_types];
+
+template <typename fnT, typename argTy>
+struct AscendingSortContigFactory
+{
+    fnT get()
+    {
+        using dpctl::tensor::rich_comparisons::AscendingSorter;
+        using Comp = typename AscendingSorter<argTy>::type;
+
+        using dpctl::tensor::kernels::stable_sort_axis1_contig_impl;
+        return stable_sort_axis1_contig_impl<argTy, Comp>;
+    }
+};
+
+template <typename fnT, typename argTy>
+struct DescendingSortContigFactory
+{
+    fnT get()
+    {
+        using dpctl::tensor::rich_comparisons::DescendingSorter;
+        using Comp = typename DescendingSorter<argTy>::type;
+
+        using dpctl::tensor::kernels::stable_sort_axis1_contig_impl;
+        return stable_sort_axis1_contig_impl<argTy, Comp>;
+    }
+};
+
+void init_merge_sort_dispatch_vectors(void)
+{
+    using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+    td_ns::DispatchVectorBuilder<sort_contig_fn_ptr_t,
+                                 AscendingSortContigFactory, td_ns::num_types>
+        dtv1;
+    dtv1.populate_dispatch_vector(ascending_sort_contig_dispatch_vector);
+
+    td_ns::DispatchVectorBuilder<sort_contig_fn_ptr_t,
+                                 DescendingSortContigFactory, td_ns::num_types>
+        dtv2;
+    dtv2.populate_dispatch_vector(descending_sort_contig_dispatch_vector);
+}
+
+void init_merge_sort_functions(py::module_ m)
+{
+    init_merge_sort_dispatch_vectors();
+
+    auto py_sort_ascending = [](const dpctl::tensor::usm_ndarray &src,
+                                const int trailing_dims_to_sort,
+                                const dpctl::tensor::usm_ndarray &dst,
+                                sycl::queue &exec_q,
+                                const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                       ascending_sort_contig_dispatch_vector);
+    };
+    m.def("_sort_ascending", py_sort_ascending, py::arg("src"),
+          py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto py_sort_descending = [](const dpctl::tensor::usm_ndarray &src,
+                                 const int trailing_dims_to_sort,
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                       descending_sort_contig_dispatch_vector);
+    };
+    m.def("_sort_descending", py_sort_descending, py::arg("src"),
+          py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    return;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp
new file mode 100644
index 000000000000..a6bdd0a4efe9
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_merge_sort_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp b/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp
new file mode 100644
index 000000000000..6328b3339376
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp
@@ -0,0 +1,184 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+template <typename sorting_contig_impl_fnT>
+std::pair<sycl::event, sycl::event>
+    py_argsort(const dpctl::tensor::usm_ndarray &src,
+               const int trailing_dims_to_sort,
+               const dpctl::tensor::usm_ndarray &dst,
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends,
+               const sorting_contig_impl_fnT &sort_contig_fns)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+    int iteration_nd = src_nd - trailing_dims_to_sort;
+    if (trailing_dims_to_sort <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dim_to_sort must be positive, but no "
+                              "greater than rank of the array being sorted");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+
+    for (int i = 0; same_shapes && (i < iteration_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t sort_nelems(1);
+    for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        sort_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if ((iter_nelems == 0) || (sort_nelems == 0)) {
+        // Nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, sort_nelems * iter_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if ((dst_typeid != static_cast<int>(td_ns::typenum_t::INT64)) &&
+        (dst_typeid != static_cast<int>(td_ns::typenum_t::INT32)))
+    {
+        throw py::value_error(
+            "Output index array must have data type int32 or int64");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    if (is_src_c_contig && is_dst_c_contig) {
+        if (sort_nelems > 1) {
+            static constexpr py::ssize_t zero_offset = py::ssize_t(0);
+
+            auto fn = sort_contig_fns[src_typeid][dst_typeid];
+
+            if (fn == nullptr) {
+                throw py::value_error(
+                    "Not implemented for dtypes of input arrays");
+            }
+
+            sycl::event comp_ev =
+                fn(exec_q, iter_nelems, sort_nelems, src.get_data(),
+                   dst.get_data(), zero_offset, zero_offset, zero_offset,
+                   zero_offset, depends);
+
+            sycl::event keep_args_alive_ev =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev});
+
+            return std::make_pair(keep_args_alive_ev, comp_ev);
+        }
+        else {
+            assert(dst.get_size() == iter_nelems);
+            int dst_elemsize = dst.get_elemsize();
+            static constexpr int memset_val(0);
+
+            sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(depends);
+
+                cgh.memset(reinterpret_cast<void *>(dst.get_data()), memset_val,
+                           iter_nelems * dst_elemsize);
+            });
+
+            sycl::event keep_args_alive_ev =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {fill_ev});
+
+            return std::make_pair(keep_args_alive_ev, fill_ev);
+        }
+    }
+
+    throw py::value_error(
+        "Both source and destination arrays must be C-contiguous");
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp b/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp
new file mode 100644
index 000000000000..ee8777f35077
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp
@@ -0,0 +1,178 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+template <typename sorting_contig_impl_fnT>
+std::pair<sycl::event, sycl::event>
+    py_sort(const dpctl::tensor::usm_ndarray &src,
+            const int trailing_dims_to_sort,
+            const dpctl::tensor::usm_ndarray &dst,
+            sycl::queue &exec_q,
+            const std::vector<sycl::event> &depends,
+            const sorting_contig_impl_fnT &sort_contig_fns)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+    int iteration_nd = src_nd - trailing_dims_to_sort;
+    if (trailing_dims_to_sort <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dim_to_sort must be positive, but no "
+                              "greater than rank of the array being sorted");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+
+    for (int i = 0; same_shapes && (i < iteration_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t sort_nelems(1);
+    for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        sort_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if ((iter_nelems == 0) || (sort_nelems == 0)) {
+        // Nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, sort_nelems * iter_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error("Both input arrays must have "
+                              "the same value data type");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    if (is_src_c_contig && is_dst_c_contig) {
+        if (sort_nelems > 1) {
+            static constexpr py::ssize_t zero_offset = py::ssize_t(0);
+
+            auto fn = sort_contig_fns[src_typeid];
+
+            if (nullptr == fn) {
+                throw py::value_error(
+                    "Not implemented for the dtype of input arrays");
+            }
+
+            sycl::event comp_ev =
+                fn(exec_q, iter_nelems, sort_nelems, src.get_data(),
+                   dst.get_data(), zero_offset, zero_offset, zero_offset,
+                   zero_offset, depends);
+
+            sycl::event keep_args_alive_ev =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev});
+
+            return std::make_pair(keep_args_alive_ev, comp_ev);
+        }
+        else {
+            assert(dst.get_size() == iter_nelems);
+            int src_elemsize = src.get_elemsize();
+
+            sycl::event copy_ev =
+                exec_q.copy<char>(src.get_data(), dst.get_data(),
+                                  src_elemsize * iter_nelems, depends);
+
+            return std::make_pair(
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {copy_ev}),
+                copy_ev);
+        }
+    }
+
+    throw py::value_error(
+        "Both source and destination arrays must be C-contiguous");
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp
new file mode 100644
index 000000000000..e54b8f739a4b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp
@@ -0,0 +1,187 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/sorting/radix_sort.hpp"
+#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
+
+#include "py_argsort_common.hpp"
+#include "radix_argsort.hpp"
+#include "radix_sort_support.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace impl_ns = dpctl::tensor::kernels::radix_sort_details;
+
+using dpctl::tensor::ssize_t;
+using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+static sort_contig_fn_ptr_t
+    ascending_radix_argsort_contig_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+static sort_contig_fn_ptr_t
+    descending_radix_argsort_contig_dispatch_table[td_ns::num_types]
+                                                  [td_ns::num_types];
+
+namespace
+{
+
+template <bool is_ascending, typename T, typename I>
+sycl::event argsort_axis1_contig_caller(sycl::queue &q,
+                                        std::size_t iter_nelems,
+                                        std::size_t sort_nelems,
+                                        const char *arg_cp,
+                                        char *res_cp,
+                                        ssize_t iter_arg_offset,
+                                        ssize_t iter_res_offset,
+                                        ssize_t sort_arg_offset,
+                                        ssize_t sort_res_offset,
+                                        const std::vector<sycl::event> &depends)
+{
+    using dpctl::tensor::kernels::radix_argsort_axis1_contig_impl;
+
+    return radix_argsort_axis1_contig_impl<T, I>(
+        q, is_ascending, iter_nelems, sort_nelems, arg_cp, res_cp,
+        iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset,
+        depends);
+}
+
+} // end of anonymous namespace
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct AscendingRadixArgSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (RadixSortSupportVector<argTy>::is_defined &&
+                      (std::is_same_v<IndexTy, std::int64_t> ||
+                       std::is_same_v<IndexTy, std::int32_t>))
+        {
+            return argsort_axis1_contig_caller<
+                /*ascending*/ true, argTy, IndexTy>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct DescendingRadixArgSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (RadixSortSupportVector<argTy>::is_defined &&
+                      (std::is_same_v<IndexTy, std::int64_t> ||
+                       std::is_same_v<IndexTy, std::int32_t>))
+        {
+            return argsort_axis1_contig_caller<
+                /*ascending*/ false, argTy, IndexTy>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_radix_argsort_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+    td_ns::DispatchTableBuilder<sort_contig_fn_ptr_t,
+                                AscendingRadixArgSortContigFactory,
+                                td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(ascending_radix_argsort_contig_dispatch_table);
+
+    td_ns::DispatchTableBuilder<sort_contig_fn_ptr_t,
+                                DescendingRadixArgSortContigFactory,
+                                td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(
+        descending_radix_argsort_contig_dispatch_table);
+}
+
+void init_radix_argsort_functions(py::module_ m)
+{
+    init_radix_argsort_dispatch_tables();
+
+    auto py_radix_argsort_ascending =
+        [](const dpctl::tensor::usm_ndarray &src,
+           const int trailing_dims_to_sort,
+           const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
+           const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                          ascending_radix_argsort_contig_dispatch_table);
+    };
+    m.def("_radix_argsort_ascending", py_radix_argsort_ascending,
+          py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto py_radix_argsort_descending =
+        [](const dpctl::tensor::usm_ndarray &src,
+           const int trailing_dims_to_sort,
+           const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
+           const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                          descending_radix_argsort_contig_dispatch_table);
+    };
+    m.def("_radix_argsort_descending", py_radix_argsort_descending,
+          py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    return;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp
new file mode 100644
index 000000000000..89013fbb1bdc
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_radix_argsort_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp
new file mode 100644
index 000000000000..35c71a0eb7d3
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp
@@ -0,0 +1,188 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <exception>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/sorting/radix_sort.hpp"
+#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
+
+#include "py_sort_common.hpp"
+#include "radix_sort.hpp"
+#include "radix_sort_support.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace impl_ns = dpctl::tensor::kernels::radix_sort_details;
+
+using dpctl::tensor::ssize_t;
+using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+static sort_contig_fn_ptr_t
+    ascending_radix_sort_contig_dispatch_vector[td_ns::num_types];
+static sort_contig_fn_ptr_t
+    descending_radix_sort_contig_dispatch_vector[td_ns::num_types];
+
+namespace
+{
+
+template <bool is_ascending, typename T>
+sycl::event sort_axis1_contig_caller(sycl::queue &q,
+                                     std::size_t iter_nelems,
+                                     std::size_t sort_nelems,
+                                     const char *arg_cp,
+                                     char *res_cp,
+                                     ssize_t iter_arg_offset,
+                                     ssize_t iter_res_offset,
+                                     ssize_t sort_arg_offset,
+                                     ssize_t sort_res_offset,
+                                     const std::vector<sycl::event> &depends)
+{
+    using dpctl::tensor::kernels::radix_sort_axis1_contig_impl;
+
+    return radix_sort_axis1_contig_impl<T>(
+        q, is_ascending, iter_nelems, sort_nelems, arg_cp, res_cp,
+        iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset,
+        depends);
+}
+
+} // end of anonymous namespace
+
+template <typename fnT, typename argTy>
+struct AscendingRadixSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (RadixSortSupportVector<argTy>::is_defined) {
+            return sort_axis1_contig_caller</*ascending*/ true, argTy>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy>
+struct DescendingRadixSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (RadixSortSupportVector<argTy>::is_defined) {
+            return sort_axis1_contig_caller</*ascending*/ false, argTy>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_radix_sort_dispatch_vectors(void)
+{
+    using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+    td_ns::DispatchVectorBuilder<
+        sort_contig_fn_ptr_t, AscendingRadixSortContigFactory, td_ns::num_types>
+        dtv1;
+    dtv1.populate_dispatch_vector(ascending_radix_sort_contig_dispatch_vector);
+
+    td_ns::DispatchVectorBuilder<sort_contig_fn_ptr_t,
+                                 DescendingRadixSortContigFactory,
+                                 td_ns::num_types>
+        dtv2;
+    dtv2.populate_dispatch_vector(descending_radix_sort_contig_dispatch_vector);
+}
+
+bool py_radix_sort_defined(int typenum)
+{
+    const auto &array_types = td_ns::usm_ndarray_types();
+
+    try {
+        int type_id = array_types.typenum_to_lookup_id(typenum);
+        return (nullptr !=
+                ascending_radix_sort_contig_dispatch_vector[type_id]);
+    } catch (const std::exception &e) {
+        return false;
+    }
+}
+
+void init_radix_sort_functions(py::module_ m)
+{
+    init_radix_sort_dispatch_vectors();
+
+    auto py_radix_sort_ascending = [](const dpctl::tensor::usm_ndarray &src,
+                                      const int trailing_dims_to_sort,
+                                      const dpctl::tensor::usm_ndarray &dst,
+                                      sycl::queue &exec_q,
+                                      const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                       ascending_radix_sort_contig_dispatch_vector);
+    };
+    m.def("_radix_sort_ascending", py_radix_sort_ascending, py::arg("src"),
+          py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto py_radix_sort_descending = [](const dpctl::tensor::usm_ndarray &src,
+                                       const int trailing_dims_to_sort,
+                                       const dpctl::tensor::usm_ndarray &dst,
+                                       sycl::queue &exec_q,
+                                       const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                       descending_radix_sort_contig_dispatch_vector);
+    };
+    m.def("_radix_sort_descending", py_radix_sort_descending, py::arg("src"),
+          py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_radix_sort_dtype_supported", py_radix_sort_defined);
+
+    return;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp
new file mode 100644
index 000000000000..5f3c771b464b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_radix_sort_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp
new file mode 100644
index 000000000000..8d7e55a5cd28
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp
@@ -0,0 +1,78 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::py_internal
+{
+
+template <typename Ty, typename ArgTy>
+struct TypeDefinedEntry : std::bool_constant<std::is_same_v<Ty, ArgTy>>
+{
+    static constexpr bool is_defined = true;
+};
+
+struct NotDefinedEntry : std::true_type
+{
+    static constexpr bool is_defined = false;
+};
+
+template <typename T>
+struct RadixSortSupportVector
+{
+    using resolver_t =
+        typename std::disjunction<TypeDefinedEntry<T, bool>,
+                                  TypeDefinedEntry<T, std::int8_t>,
+                                  TypeDefinedEntry<T, std::uint8_t>,
+                                  TypeDefinedEntry<T, std::int16_t>,
+                                  TypeDefinedEntry<T, std::uint16_t>,
+                                  TypeDefinedEntry<T, std::int32_t>,
+                                  TypeDefinedEntry<T, std::uint32_t>,
+                                  TypeDefinedEntry<T, std::int64_t>,
+                                  TypeDefinedEntry<T, std::uint64_t>,
+                                  TypeDefinedEntry<T, sycl::half>,
+                                  TypeDefinedEntry<T, float>,
+                                  TypeDefinedEntry<T, double>,
+                                  NotDefinedEntry>;
+
+    static constexpr bool is_defined = resolver_t::is_defined;
+};
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp
new file mode 100644
index 000000000000..8b1ce04a97d6
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp
@@ -0,0 +1,478 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/sorting/searchsorted.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/rich_comparisons.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "simplify_iteration_space.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace detail
+{
+
+using dpctl::tensor::kernels::searchsorted_contig_impl_fp_ptr_t;
+
+static searchsorted_contig_impl_fp_ptr_t
+    left_side_searchsorted_contig_impl[td_ns::num_types][td_ns::num_types];
+
+static searchsorted_contig_impl_fp_ptr_t
+    right_side_searchsorted_contig_impl[td_ns::num_types][td_ns::num_types];
+
+template <typename fnT, typename argTy, typename indTy>
+struct LeftSideSearchSortedContigFactory
+{
+    constexpr LeftSideSearchSortedContigFactory() {}
+
+    fnT get() const
+    {
+        if constexpr (std::is_same_v<indTy, std::int32_t> ||
+                      std::is_same_v<indTy, std::int64_t>)
+        {
+            static constexpr bool left_side_search(true);
+            using dpctl::tensor::kernels::searchsorted_contig_impl;
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+
+            using Compare = typename AscendingSorter<argTy>::type;
+
+            return searchsorted_contig_impl<argTy, indTy, left_side_search,
+                                            Compare>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy, typename indTy>
+struct RightSideSearchSortedContigFactory
+{
+    constexpr RightSideSearchSortedContigFactory() {}
+
+    fnT get() const
+    {
+        if constexpr (std::is_same_v<indTy, std::int32_t> ||
+                      std::is_same_v<indTy, std::int64_t>)
+        {
+            static constexpr bool right_side_search(false);
+
+            using dpctl::tensor::kernels::searchsorted_contig_impl;
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+
+            using Compare = typename AscendingSorter<argTy>::type;
+
+            return searchsorted_contig_impl<argTy, indTy, right_side_search,
+                                            Compare>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+using dpctl::tensor::kernels::searchsorted_strided_impl_fp_ptr_t;
+
+static searchsorted_strided_impl_fp_ptr_t
+    left_side_searchsorted_strided_impl[td_ns::num_types][td_ns::num_types];
+
+static searchsorted_strided_impl_fp_ptr_t
+    right_side_searchsorted_strided_impl[td_ns::num_types][td_ns::num_types];
+
+template <typename fnT, typename argTy, typename indTy>
+struct LeftSideSearchSortedStridedFactory
+{
+    constexpr LeftSideSearchSortedStridedFactory() {}
+
+    fnT get() const
+    {
+        if constexpr (std::is_same_v<indTy, std::int32_t> ||
+                      std::is_same_v<indTy, std::int64_t>)
+        {
+            static constexpr bool left_side_search(true);
+            using dpctl::tensor::kernels::searchsorted_strided_impl;
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+
+            using Compare = typename AscendingSorter<argTy>::type;
+
+            return searchsorted_strided_impl<argTy, indTy, left_side_search,
+                                             Compare>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy, typename indTy>
+struct RightSideSearchSortedStridedFactory
+{
+    constexpr RightSideSearchSortedStridedFactory() {}
+
+    fnT get() const
+    {
+        if constexpr (std::is_same_v<indTy, std::int32_t> ||
+                      std::is_same_v<indTy, std::int64_t>)
+        {
+            static constexpr bool right_side_search(false);
+            using dpctl::tensor::kernels::searchsorted_strided_impl;
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+
+            using Compare = typename AscendingSorter<argTy>::type;
+
+            return searchsorted_strided_impl<argTy, indTy, right_side_search,
+                                             Compare>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_searchsorted_dispatch_table(void)
+{
+
+    // Contiguous input function dispatch
+    td_ns::DispatchTableBuilder<searchsorted_contig_impl_fp_ptr_t,
+                                LeftSideSearchSortedContigFactory,
+                                td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(left_side_searchsorted_contig_impl);
+
+    td_ns::DispatchTableBuilder<searchsorted_contig_impl_fp_ptr_t,
+                                RightSideSearchSortedContigFactory,
+                                td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(right_side_searchsorted_contig_impl);
+
+    // Strided input function dispatch
+    td_ns::DispatchTableBuilder<searchsorted_strided_impl_fp_ptr_t,
+                                LeftSideSearchSortedStridedFactory,
+                                td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(left_side_searchsorted_strided_impl);
+
+    td_ns::DispatchTableBuilder<searchsorted_strided_impl_fp_ptr_t,
+                                RightSideSearchSortedStridedFactory,
+                                td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(right_side_searchsorted_strided_impl);
+}
+
+} // namespace detail
+
+/*! @brief search for needle from needles in sorted hay */
+std::pair<sycl::event, sycl::event>
+    py_searchsorted(const dpctl::tensor::usm_ndarray &hay,
+                    const dpctl::tensor::usm_ndarray &needles,
+                    const dpctl::tensor::usm_ndarray &positions,
+                    sycl::queue &exec_q,
+                    const bool search_left_side,
+                    const std::vector<sycl::event> &depends)
+{
+    const int hay_nd = hay.get_ndim();
+    const int needles_nd = needles.get_ndim();
+    const int positions_nd = positions.get_ndim();
+
+    if (hay_nd != 1 || needles_nd != positions_nd) {
+        throw py::value_error("Array dimensions mismatch");
+    }
+
+    // check that needle and positions have the same shape
+    std::size_t needles_nelems(1);
+    bool same_shape(true);
+
+    const std::size_t hay_nelems = static_cast<std::size_t>(hay.get_shape(0));
+
+    const py::ssize_t *needles_shape_ptr = needles.get_shape_raw();
+    const py::ssize_t *positions_shape_ptr = positions.get_shape_raw();
+
+    for (int i = 0; (i < needles_nd) && same_shape; ++i) {
+        const auto needles_sh_i = needles_shape_ptr[i];
+        const auto positions_sh_i = positions_shape_ptr[i];
+
+        same_shape = same_shape && (needles_sh_i == positions_sh_i);
+        needles_nelems *= static_cast<std::size_t>(needles_sh_i);
+    }
+
+    if (!same_shape) {
+        throw py::value_error(
+            "Array of values to search for and array of their "
+            "positions do not have the same shape");
+    }
+
+    // check that positions is ample enough
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(positions,
+                                                               needles_nelems);
+
+    // check that positions is writable
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(positions);
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(exec_q, {hay, needles, positions}))
+    {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    // if output array overlaps with input arrays, race condition results
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(positions, hay) || overlap(positions, needles)) {
+        throw py::value_error("Destination array overlaps with input.");
+    }
+
+    const int hay_typenum = hay.get_typenum();
+    const int needles_typenum = needles.get_typenum();
+    const int positions_typenum = positions.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    const int hay_typeid = array_types.typenum_to_lookup_id(hay_typenum);
+    const int needles_typeid =
+        array_types.typenum_to_lookup_id(needles_typenum);
+    const int positions_typeid =
+        array_types.typenum_to_lookup_id(positions_typenum);
+
+    // check hay and needle have the same data-type
+    if (needles_typeid != hay_typeid) {
+        throw py::value_error(
+            "Hay array and needles array must have the same data types");
+    }
+    // check that positions has indexing data-type (int32, or int64)
+    const auto positions_typenum_t_v =
+        static_cast<td_ns::typenum_t>(positions_typeid);
+    if (positions_typenum_t_v != td_ns::typenum_t::INT32 &&
+        positions_typenum_t_v != td_ns::typenum_t::INT64)
+    {
+        throw py::value_error(
+            "Positions array must have data-type int32, or int64");
+    }
+
+    if (needles_nelems == 0) {
+        // Nothing to do
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    // if all inputs are contiguous call contiguous implementations
+    // otherwise call strided implementation
+    const bool hay_is_c_contig = hay.is_c_contiguous();
+    const bool hay_is_f_contig = hay.is_f_contiguous();
+
+    const bool needles_is_c_contig = needles.is_c_contiguous();
+    const bool needles_is_f_contig = needles.is_f_contiguous();
+
+    const bool positions_is_c_contig = positions.is_c_contiguous();
+    const bool positions_is_f_contig = positions.is_f_contiguous();
+
+    const bool all_c_contig =
+        (hay_is_c_contig && needles_is_c_contig && positions_is_c_contig);
+    const bool all_f_contig =
+        (hay_is_f_contig && needles_is_f_contig && positions_is_f_contig);
+
+    const char *hay_data = hay.get_data();
+    const char *needles_data = needles.get_data();
+
+    char *positions_data = positions.get_data();
+
+    if (all_c_contig || all_f_contig) {
+        auto fn =
+            (search_left_side)
+                ? detail::left_side_searchsorted_contig_impl[hay_typeid]
+                                                            [positions_typeid]
+                : detail::right_side_searchsorted_contig_impl[hay_typeid]
+                                                             [positions_typeid];
+
+        if (fn) {
+            static constexpr py::ssize_t zero_offset(0);
+
+            sycl::event comp_ev =
+                fn(exec_q, hay_nelems, needles_nelems, hay_data, zero_offset,
+                   needles_data, zero_offset, positions_data, zero_offset,
+                   depends);
+
+            return std::make_pair(
+                dpctl::utils::keep_args_alive(exec_q, {hay, needles, positions},
+                                              {comp_ev}),
+                comp_ev);
+        }
+    }
+
+    // strided case
+
+    const auto &needles_strides = needles.get_strides_vector();
+    const auto &positions_strides = positions.get_strides_vector();
+
+    int simplified_nd = needles_nd;
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_common_shape;
+    shT simplified_needles_strides;
+    shT simplified_positions_strides;
+    py::ssize_t needles_offset(0);
+    py::ssize_t positions_offset(0);
+
+    if (simplified_nd == 0) {
+        // needles and positions have same nd
+        simplified_nd = 1;
+        simplified_common_shape.push_back(1);
+        simplified_needles_strides.push_back(0);
+        simplified_positions_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            // modified by reference
+            simplified_nd,
+            // read-only inputs
+            needles_shape_ptr, needles_strides, positions_strides,
+            // output, modified by reference
+            simplified_common_shape, simplified_needles_strides,
+            simplified_positions_strides, needles_offset, positions_offset);
+    }
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events,
+        // vectors being packed
+        simplified_common_shape, simplified_needles_strides,
+        simplified_positions_strides);
+    auto packed_shape_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    const sycl::event &copy_shape_strides_ev =
+        std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shape_strides_ev);
+
+    auto strided_fn =
+        (search_left_side)
+            ? detail::left_side_searchsorted_strided_impl[hay_typeid]
+                                                         [positions_typeid]
+            : detail::right_side_searchsorted_strided_impl[hay_typeid]
+                                                          [positions_typeid];
+
+    if (!strided_fn) {
+        throw std::runtime_error(
+            "No implementation for data types of input arrays");
+    }
+
+    static constexpr py::ssize_t zero_offset(0);
+    py::ssize_t hay_step = hay.get_strides_vector()[0];
+
+    const sycl::event &comp_ev = strided_fn(
+        exec_q, hay_nelems, needles_nelems, hay_data, zero_offset, hay_step,
+        needles_data, needles_offset, positions_data, positions_offset,
+        simplified_nd, packed_shape_strides, all_deps);
+
+    // free packed temporaries
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {comp_ev}, packed_shape_strides_owner);
+
+    host_task_events.push_back(temporaries_cleanup_ev);
+    const sycl::event &ht_ev = dpctl::utils::keep_args_alive(
+        exec_q, {hay, needles, positions}, host_task_events);
+
+    return std::make_pair(ht_ev, comp_ev);
+}
+
+/*! @brief search for needle from needles in sorted hay,
+ *         hay[pos] <= needle < hay[pos + 1]
+ */
+std::pair<sycl::event, sycl::event>
+    py_searchsorted_left(const dpctl::tensor::usm_ndarray &hay,
+                         const dpctl::tensor::usm_ndarray &needles,
+                         const dpctl::tensor::usm_ndarray &positions,
+                         sycl::queue &exec_q,
+                         const std::vector<sycl::event> &depends)
+{
+    static constexpr bool side_left(true);
+    return py_searchsorted(hay, needles, positions, exec_q, side_left, depends);
+}
+
+/*! @brief search for needle from needles in sorted hay,
+ *         hay[pos] < needle <= hay[pos + 1]
+ */
+std::pair<sycl::event, sycl::event>
+    py_searchsorted_right(const dpctl::tensor::usm_ndarray &hay,
+                          const dpctl::tensor::usm_ndarray &needles,
+                          const dpctl::tensor::usm_ndarray &positions,
+                          sycl::queue &exec_q,
+                          const std::vector<sycl::event> &depends)
+{
+    static constexpr bool side_right(false);
+    return py_searchsorted(hay, needles, positions, exec_q, side_right,
+                           depends);
+}
+
+void init_searchsorted_functions(py::module_ m)
+{
+    detail::init_searchsorted_dispatch_table();
+
+    m.def("_searchsorted_left", &py_searchsorted_left, py::arg("hay"),
+          py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+    m.def("_searchsorted_right", &py_searchsorted_right, py::arg("hay"),
+          py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp
new file mode 100644
index 000000000000..b60dae1e0ec9
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_searchsorted_functions(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp b/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp
new file mode 100644
index 000000000000..6b8344df12c8
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp
@@ -0,0 +1,303 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/sorting/topk.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/rich_comparisons.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "topk.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+typedef sycl::event (*topk_impl_fn_ptr_t)(sycl::queue &,
+                                          std::size_t,
+                                          std::size_t,
+                                          std::size_t,
+                                          bool,
+                                          const char *,
+                                          char *,
+                                          char *,
+                                          const std::vector<sycl::event> &);
+
+static topk_impl_fn_ptr_t topk_dispatch_vector[td_ns::num_types];
+
+namespace
+{
+
+template <typename T, typename = void>
+struct use_radix_sort : public std::false_type
+{
+};
+
+template <typename T>
+struct use_radix_sort<
+    T,
+    std::enable_if_t<std::disjunction<std::is_same<T, bool>,
+                                      std::is_same<T, std::uint8_t>,
+                                      std::is_same<T, std::int8_t>,
+                                      std::is_same<T, std::uint16_t>,
+                                      std::is_same<T, std::int16_t>>::value>>
+    : public std::true_type
+{
+};
+
+template <typename argTy, typename IndexTy>
+sycl::event topk_caller(sycl::queue &exec_q,
+                        std::size_t iter_nelems, // number of sub-arrays
+                        std::size_t axis_nelems, // size of each sub-array
+                        std::size_t k,
+                        bool largest,
+                        const char *arg_cp,
+                        char *vals_cp,
+                        char *inds_cp,
+                        const std::vector<sycl::event> &depends)
+{
+    if constexpr (use_radix_sort<argTy>::value) {
+        using dpctl::tensor::kernels::topk_radix_impl;
+        auto ascending = !largest;
+        return topk_radix_impl<argTy, IndexTy>(exec_q, iter_nelems, axis_nelems,
+                                               k, ascending, arg_cp, vals_cp,
+                                               inds_cp, depends);
+    }
+    else {
+        using dpctl::tensor::kernels::topk_merge_impl;
+        if (largest) {
+            using CompTy =
+                typename dpctl::tensor::rich_comparisons::DescendingSorter<
+                    argTy>::type;
+            return topk_merge_impl<argTy, IndexTy, CompTy>(
+                exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp,
+                depends);
+        }
+        else {
+            using CompTy =
+                typename dpctl::tensor::rich_comparisons::AscendingSorter<
+                    argTy>::type;
+            return topk_merge_impl<argTy, IndexTy, CompTy>(
+                exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp,
+                depends);
+        }
+    }
+}
+
+} // namespace
+
+std::pair<sycl::event, sycl::event>
+    py_topk(const dpctl::tensor::usm_ndarray &src,
+            std::optional<const int> trailing_dims_to_search,
+            const std::size_t k,
+            const bool largest,
+            const dpctl::tensor::usm_ndarray &vals,
+            const dpctl::tensor::usm_ndarray &inds,
+            sycl::queue &exec_q,
+            const std::vector<sycl::event> &depends)
+{
+    int src_nd = src.get_ndim();
+    int vals_nd = vals.get_ndim();
+    int inds_nd = inds.get_ndim();
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *vals_shape_ptr = vals.get_shape_raw();
+    const py::ssize_t *inds_shape_ptr = inds.get_shape_raw();
+
+    std::size_t axis_nelems(1);
+    std::size_t iter_nelems(1);
+    if (trailing_dims_to_search.has_value()) {
+        if (src_nd != vals_nd || src_nd != inds_nd) {
+            throw py::value_error("The input and output arrays must have "
+                                  "the same array ranks");
+        }
+
+        auto trailing_dims = trailing_dims_to_search.value();
+        int iter_nd = src_nd - trailing_dims;
+        if (trailing_dims <= 0 || iter_nd < 0) {
+            throw py::value_error(
+                "trailing_dims_to_search must be positive, but no "
+                "greater than rank of the array being searched");
+        }
+
+        bool same_shapes = true;
+        for (int i = 0; same_shapes && (i < iter_nd); ++i) {
+            auto src_shape_i = src_shape_ptr[i];
+            same_shapes = same_shapes && (src_shape_i == vals_shape_ptr[i] &&
+                                          src_shape_i == inds_shape_ptr[i]);
+            iter_nelems *= static_cast<std::size_t>(src_shape_i);
+        }
+
+        if (!same_shapes) {
+            throw py::value_error(
+                "Destination shape does not match the input shape");
+        }
+
+        std::size_t vals_k(1);
+        std::size_t inds_k(1);
+        for (int i = iter_nd; i < src_nd; ++i) {
+            axis_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+            vals_k *= static_cast<std::size_t>(vals_shape_ptr[i]);
+            inds_k *= static_cast<std::size_t>(inds_shape_ptr[i]);
+        }
+
+        bool valid_k = (vals_k == k && inds_k == k && axis_nelems >= k);
+        if (!valid_k) {
+            throw py::value_error("The value of k is invalid for the input and "
+                                  "destination arrays");
+        }
+    }
+    else {
+        if (vals_nd != 1 || inds_nd != 1) {
+            throw py::value_error("Output arrays must be one-dimensional");
+        }
+
+        for (int i = 0; i < src_nd; ++i) {
+            axis_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+        }
+
+        bool valid_k = (axis_nelems >= k &&
+                        static_cast<std::size_t>(vals_shape_ptr[0]) == k &&
+                        static_cast<std::size_t>(inds_shape_ptr[0]) == k);
+        if (!valid_k) {
+            throw py::value_error("The value of k is invalid for the input and "
+                                  "destination arrays");
+        }
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(vals);
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(inds);
+
+    if ((iter_nelems == 0) || (axis_nelems == 0)) {
+        // Nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, vals) || overlap(src, inds)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(vals,
+                                                               k * iter_nelems);
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(inds,
+                                                               k * iter_nelems);
+
+    int src_typenum = src.get_typenum();
+    int vals_typenum = vals.get_typenum();
+    int inds_typenum = inds.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int vals_typeid = array_types.typenum_to_lookup_id(vals_typenum);
+    int inds_typeid = array_types.typenum_to_lookup_id(inds_typenum);
+
+    if (src_typeid != vals_typeid) {
+        throw py::value_error("Input array and vals array must have "
+                              "the same data type");
+    }
+
+    if (inds_typeid != static_cast<int>(td_ns::typenum_t::INT64)) {
+        throw py::value_error("Inds array must have data type int64");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_vals_c_contig = vals.is_c_contiguous();
+    bool is_inds_c_contig = inds.is_c_contiguous();
+
+    if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) {
+        auto fn = topk_dispatch_vector[src_typeid];
+
+        sycl::event comp_ev =
+            fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(),
+               vals.get_data(), inds.get_data(), depends);
+
+        sycl::event keep_args_alive_ev =
+            dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev});
+
+        return std::make_pair(keep_args_alive_ev, comp_ev);
+    }
+
+    return std::make_pair(sycl::event(), sycl::event());
+}
+
+template <typename fnT, typename T>
+struct TopKFactory
+{
+    fnT get()
+    {
+        using IdxT = std::int64_t;
+        return topk_caller<T, IdxT>;
+    }
+};
+
+void init_topk_dispatch_vectors(void)
+{
+    td_ns::DispatchVectorBuilder<topk_impl_fn_ptr_t, TopKFactory,
+                                 td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(topk_dispatch_vector);
+}
+
+void init_topk_functions(py::module_ m)
+{
+    init_topk_dispatch_vectors();
+
+    m.def("_topk", &py_topk, py::arg("src"), py::arg("trailing_dims_to_search"),
+          py::arg("k"), py::arg("largest"), py::arg("vals"), py::arg("inds"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/topk.hpp b/dpctl_ext/tensor/libtensor/source/sorting/topk.hpp
new file mode 100644
index 000000000000..d39c0eefdb93
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/sorting/topk.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_topk_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp
new file mode 100644
index 000000000000..318c3559d77c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp
@@ -0,0 +1,55 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "sorting/isin.hpp"
+#include "sorting/merge_argsort.hpp"
+#include "sorting/merge_sort.hpp"
+#include "sorting/radix_argsort.hpp"
+#include "sorting/radix_sort.hpp"
+#include "sorting/searchsorted.hpp"
+#include "sorting/topk.hpp"
+
+PYBIND11_MODULE(_tensor_sorting_impl, m)
+{
+    dpctl::tensor::py_internal::init_isin_functions(m);
+    dpctl::tensor::py_internal::init_merge_sort_functions(m);
+    dpctl::tensor::py_internal::init_merge_argsort_functions(m);
+    dpctl::tensor::py_internal::init_searchsorted_functions(m);
+    dpctl::tensor::py_internal::init_radix_sort_functions(m);
+    dpctl::tensor::py_internal::init_radix_argsort_functions(m);
+    dpctl::tensor::py_internal::init_topk_functions(m);
+}
diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py
index 1834f25a0485..3e3501b14c7c 100644
--- a/dpnp/dpnp_iface_logic.py
+++ b/dpnp/dpnp_iface_logic.py
@@ -49,6 +49,9 @@
 import dpctl.utils as dpu
 import numpy
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
 from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc
@@ -1273,7 +1276,7 @@ def isin(
         usm_element = dpnp.get_usm_ndarray(element)
         usm_test = dpnp.get_usm_ndarray(test_elements)
     return dpnp_array._create_from_usm_ndarray(
-        dpt.isin(
+        dpt_ext.isin(
             usm_element,
             usm_test,
             invert=invert,
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index d188ae098cd9..7eb44f79ae38 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -378,22 +378,24 @@ def _get_first_nan_index(usm_a):
                 true_val = dpt_ext.asarray(
                     True, sycl_queue=usm_a.sycl_queue, usm_type=usm_a.usm_type
                 )
-                return dpt.searchsorted(dpt.isnan(usm_a), true_val, side="left")
-            return dpt.searchsorted(usm_a, usm_a[-1], side="left")
+                return dpt_ext.searchsorted(
+                    dpt.isnan(usm_a), true_val, side="left"
+                )
+            return dpt_ext.searchsorted(usm_a, usm_a[-1], side="left")
         return None
 
     usm_ar = dpnp.get_usm_ndarray(ar)
 
     num_of_flags = (return_index, return_inverse, return_counts).count(True)
     if num_of_flags == 0:
-        usm_res = dpt.unique_values(usm_ar)
+        usm_res = dpt_ext.unique_values(usm_ar)
         usm_res = (usm_res,)  # cast to a tuple to align with other cases
     elif num_of_flags == 1 and return_inverse:
-        usm_res = dpt.unique_inverse(usm_ar)
+        usm_res = dpt_ext.unique_inverse(usm_ar)
     elif num_of_flags == 1 and return_counts:
-        usm_res = dpt.unique_counts(usm_ar)
+        usm_res = dpt_ext.unique_counts(usm_ar)
     else:
-        usm_res = dpt.unique_all(usm_ar)
+        usm_res = dpt_ext.unique_all(usm_ar)
 
     first_nan = None
     if equal_nan:
diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py
index 15f52338ec7e..055aaa999c3a 100644
--- a/dpnp/dpnp_iface_searching.py
+++ b/dpnp/dpnp_iface_searching.py
@@ -382,7 +382,7 @@ def searchsorted(a, v, side="left", sorter=None):
 
     usm_sorter = None if sorter is None else dpnp.get_usm_ndarray(sorter)
     return dpnp_array._create_from_usm_ndarray(
-        dpt.searchsorted(usm_a, usm_v, side=side, sorter=usm_sorter)
+        dpt_ext.searchsorted(usm_a, usm_v, side=side, sorter=usm_sorter)
     )
 
 
diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py
index 5f7a3829b3c9..e7abef1f4338 100644
--- a/dpnp/dpnp_iface_sorting.py
+++ b/dpnp/dpnp_iface_sorting.py
@@ -41,11 +41,9 @@
 
 from collections.abc import Sequence
 
-import dpctl.tensor as dpt
-
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpnp
 from dpctl_ext.tensor._numpy_helper import normalize_axis_index
 
@@ -87,7 +85,7 @@ def _wrap_sort_argsort(
 
     usm_a = dpnp.get_usm_ndarray(a)
     if axis is None:
-        usm_a = dpt_ext.reshape(usm_a, -1)
+        usm_a = dpt.reshape(usm_a, -1)
         axis = -1
 
     axis = normalize_axis_index(axis, ndim=usm_a.ndim)
diff --git a/pyproject.toml b/pyproject.toml
index 67fb75cb5f54..09253467b8dc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314']
 [tool.codespell]
 builtin = "clear,rare,informal,names"
 check-filenames = true
-ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT"
+ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT,OffsetT,inpT"
 quiet-level = 3
 
 [tool.coverage.report]

From 6cc6b6c6b9ab4b6c3b17300fc32d2f65c5717f67 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 10 Mar 2026 16:45:36 +0100
Subject: [PATCH 11/43] Move `_tensor_reductions_impl` extension and use it for
 dpnp (#2794)

This PR completely moves `_tensor_reductions_impl` pybind11 extension
into `dpctl_ext.tensor` and extends dpctl_ext.tensor Python API with the
functions: `all, any, diff, argmax, argmin, count_nonzero, logsumexp, max. min, prod, reduce_hypot and sum`
 reusing them in dpnp
---
 dpctl_ext/tensor/CMakeLists.txt               |   25 +-
 dpctl_ext/tensor/__init__.py                  |   24 +
 dpctl_ext/tensor/_manipulation_functions.py   |    4 +-
 dpctl_ext/tensor/_reduction.py                |  834 +++++
 dpctl_ext/tensor/_utility_functions.py        |  509 +++
 .../libtensor/include/kernels/reductions.hpp  | 3323 +++++++++++++++++
 .../libtensor/source/reductions/all.cpp       |  164 +
 .../libtensor/source/reductions/all.hpp       |   46 +
 .../libtensor/source/reductions/any.cpp       |  164 +
 .../libtensor/source/reductions/any.hpp       |   46 +
 .../libtensor/source/reductions/argmax.cpp    |  279 ++
 .../libtensor/source/reductions/argmax.hpp    |   46 +
 .../libtensor/source/reductions/argmin.cpp    |  279 ++
 .../libtensor/source/reductions/argmin.hpp    |   46 +
 .../libtensor/source/reductions/logsumexp.cpp |  258 ++
 .../libtensor/source/reductions/logsumexp.hpp |   46 +
 .../libtensor/source/reductions/max.cpp       |  410 ++
 .../libtensor/source/reductions/max.hpp       |   46 +
 .../libtensor/source/reductions/min.cpp       |  412 ++
 .../libtensor/source/reductions/min.hpp       |   46 +
 .../libtensor/source/reductions/prod.cpp      |  466 +++
 .../libtensor/source/reductions/prod.hpp      |   46 +
 .../source/reductions/reduce_hypot.cpp        |  254 ++
 .../source/reductions/reduce_hypot.hpp        |   46 +
 .../reductions/reduction_atomic_support.hpp   |  147 +
 .../source/reductions/reduction_common.cpp    |   69 +
 .../source/reductions/reduction_common.hpp    |   46 +
 .../source/reductions/reduction_over_axis.hpp | 1318 +++++++
 .../libtensor/source/reductions/sum.cpp       |  463 +++
 .../libtensor/source/reductions/sum.hpp       |   46 +
 .../libtensor/source/tensor_reductions.cpp    |   43 +
 dpnp/dpnp_iface_counting.py                   |    5 +-
 dpnp/dpnp_iface_logic.py                      |    5 +-
 dpnp/dpnp_iface_manipulation.py               |    4 +-
 dpnp/dpnp_iface_mathematical.py               |   13 +-
 dpnp/dpnp_iface_searching.py                  |   12 +-
 dpnp/dpnp_iface_statistics.py                 |    4 +-
 dpnp/dpnp_iface_trigonometric.py              |    6 +-
 38 files changed, 9970 insertions(+), 30 deletions(-)
 create mode 100644 dpctl_ext/tensor/_reduction.py
 create mode 100644 dpctl_ext/tensor/_utility_functions.py
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/all.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/all.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/any.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/any.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/max.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/max.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/min.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/min.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/prod.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/prod.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/sum.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/sum.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index 056b7c425544..cf55035c23d9 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -69,6 +69,19 @@ set(_accumulator_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp
 )
+set(_reduction_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/all.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp
+)
 set(_sorting_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp
@@ -82,6 +95,10 @@ set(_tensor_accumulation_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp
     ${_accumulator_sources}
 )
+set(_tensor_reductions_impl_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp
+    ${_reduction_sources}
+)
 set(_tensor_sorting_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp
     ${_sorting_sources}
@@ -114,6 +131,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_i
 target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
 list(APPEND _py_trgts ${python_module_name})
 
+set(python_module_name _tensor_reductions_impl)
+pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources})
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources})
+target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
+list(APPEND _py_trgts ${python_module_name})
+
 set(python_module_name _tensor_sorting_impl)
 pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources})
@@ -135,7 +158,7 @@ set(_no_fast_math_sources
 list(
     APPEND _no_fast_math_sources
     # ${_elementwise_sources}
-    # ${_reduction_sources}
+    ${_reduction_sources}
     ${_sorting_sources}
     # ${_linalg_sources}
     ${_accumulator_sources}
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index cba7c417d559..ac24151bedfe 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -78,6 +78,17 @@
     tile,
     unstack,
 )
+from ._reduction import (
+    argmax,
+    argmin,
+    count_nonzero,
+    logsumexp,
+    max,
+    min,
+    prod,
+    reduce_hypot,
+    sum,
+)
 from ._reshape import reshape
 from ._search_functions import where
 from ._searchsorted import searchsorted
@@ -90,9 +101,14 @@
 )
 from ._sorting import argsort, sort, top_k
 from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type
+from ._utility_functions import all, any, diff
 
 __all__ = [
+    "all",
+    "any",
     "arange",
+    "argmax",
+    "argmin",
     "argsort",
     "asarray",
     "asnumpy",
@@ -102,10 +118,12 @@
     "can_cast",
     "concat",
     "copy",
+    "count_nonzero",
     "clip",
     "cumulative_logsumexp",
     "cumulative_prod",
     "cumulative_sum",
+    "diff",
     "empty",
     "empty_like",
     "extract",
@@ -120,15 +138,20 @@
     "isdtype",
     "isin",
     "linspace",
+    "logsumexp",
+    "max",
     "meshgrid",
+    "min",
     "moveaxis",
     "permute_dims",
     "nonzero",
     "ones",
     "ones_like",
     "place",
+    "prod",
     "put",
     "put_along_axis",
+    "reduce_hypot",
     "repeat",
     "reshape",
     "result_type",
@@ -137,6 +160,7 @@
     "sort",
     "squeeze",
     "stack",
+    "sum",
     "swapaxes",
     "take",
     "take_along_axis",
diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py
index 08459dcaea76..e2d55c533bc0 100644
--- a/dpctl_ext/tensor/_manipulation_functions.py
+++ b/dpctl_ext/tensor/_manipulation_functions.py
@@ -624,7 +624,7 @@ def repeat(x, repeats, /, *, axis=None):
                     "'repeats' array must be broadcastable to the size of "
                     "the repeated axis"
                 )
-            if not dpt.all(repeats >= 0):
+            if not dpt_ext.all(repeats >= 0):
                 raise ValueError("'repeats' elements must be positive")
 
     elif isinstance(repeats, (tuple, list, range)):
@@ -646,7 +646,7 @@ def repeat(x, repeats, /, *, axis=None):
             repeats = dpt_ext.asarray(
                 repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q
             )
-            if not dpt.all(repeats >= 0):
+            if not dpt_ext.all(repeats >= 0):
                 raise ValueError("`repeats` elements must be positive")
     else:
         raise TypeError(
diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py
new file mode 100644
index 000000000000..b8fdcf4f37e6
--- /dev/null
+++ b/dpctl_ext/tensor/_reduction.py
@@ -0,0 +1,834 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import dpctl.tensor as dpt
+from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._tensor_impl as ti
+import dpctl_ext.tensor._tensor_reductions_impl as tri
+
+from ._numpy_helper import normalize_axis_tuple
+from ._type_utils import (
+    _default_accumulation_dtype,
+    _default_accumulation_dtype_fp_types,
+    _to_device_supported_dtype,
+)
+
+
+def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn):
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    nd = x.ndim
+    if axis is None:
+        axis = tuple(range(nd))
+        perm = list(axis)
+        x_tmp = x
+    else:
+        if not isinstance(axis, (tuple, list)):
+            axis = (axis,)
+        axis = normalize_axis_tuple(axis, nd, "axis")
+        perm = [i for i in range(nd) if i not in axis] + list(axis)
+        x_tmp = dpt_ext.permute_dims(x, perm)
+    red_nd = len(axis)
+    if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]):
+        raise ValueError("reduction cannot be performed over zero-size axes")
+    res_shape = x_tmp.shape[: nd - red_nd]
+    exec_q = x.sycl_queue
+    res_dt = x.dtype
+    res_usm_type = x.usm_type
+
+    orig_out = out
+    if out is not None:
+        if not isinstance(out, dpt.usm_ndarray):
+            raise TypeError(
+                f"output array must be of usm_ndarray type, got {type(out)}"
+            )
+        if not out.flags.writable:
+            raise ValueError("provided `out` array is read-only")
+        if not keepdims:
+            final_res_shape = res_shape
+        else:
+            inp_shape = x.shape
+            final_res_shape = tuple(
+                inp_shape[i] if i not in axis else 1 for i in range(nd)
+            )
+        if not out.shape == final_res_shape:
+            raise ValueError(
+                "The shape of input and output arrays are inconsistent. "
+                f"Expected output shape is {final_res_shape}, got {out.shape}"
+            )
+        if res_dt != out.dtype:
+            raise ValueError(
+                f"Output array of type {res_dt} is needed, got {out.dtype}"
+            )
+        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise ExecutionPlacementError(
+                "Input and output allocation queues are not compatible"
+            )
+        if keepdims:
+            out = dpt_ext.squeeze(out, axis=axis)
+            orig_out = out
+        if ti._array_overlap(x, out):
+            out = dpt_ext.empty_like(out)
+    else:
+        out = dpt_ext.empty(
+            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q
+        )
+
+    _manager = SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    if red_nd == 0:
+        ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x_tmp, dst=out, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_e_cpy, cpy_e)
+        if not (orig_out is None or orig_out is out):
+            ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out, dst=orig_out, sycl_queue=exec_q, depends=[cpy_e]
+            )
+            _manager.add_event_pair(ht_e_cpy2, cpy2_e)
+            out = orig_out
+        return out
+
+    hev, red_ev = _reduction_fn(
+        src=x_tmp,
+        trailing_dims_to_reduce=red_nd,
+        dst=out,
+        sycl_queue=exec_q,
+        depends=dep_evs,
+    )
+    _manager.add_event_pair(hev, red_ev)
+    if not (orig_out is None or orig_out is out):
+        ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev]
+        )
+        _manager.add_event_pair(ht_e_cpy2, cpy2_e)
+        out = orig_out
+
+    if keepdims:
+        res_shape = res_shape + (1,) * red_nd
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm)
+    return out
+
+
+def _reduction_over_axis(
+    x,
+    axis,
+    dtype,
+    keepdims,
+    out,
+    _reduction_fn,
+    _dtype_supported,
+    _default_reduction_type_fn,
+):
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+    nd = x.ndim
+    if axis is None:
+        axis = tuple(range(nd))
+        perm = list(axis)
+        arr = x
+    else:
+        if not isinstance(axis, (tuple, list)):
+            axis = (axis,)
+        axis = normalize_axis_tuple(axis, nd, "axis")
+        perm = [i for i in range(nd) if i not in axis] + list(axis)
+        arr = dpt_ext.permute_dims(x, perm)
+    red_nd = len(axis)
+    res_shape = arr.shape[: nd - red_nd]
+    q = x.sycl_queue
+    inp_dt = x.dtype
+    if dtype is None:
+        res_dt = _default_reduction_type_fn(inp_dt, q)
+    else:
+        res_dt = dpt.dtype(dtype)
+        res_dt = _to_device_supported_dtype(res_dt, q.sycl_device)
+
+    res_usm_type = x.usm_type
+
+    implemented_types = _dtype_supported(inp_dt, res_dt, res_usm_type, q)
+    if dtype is None and not implemented_types:
+        raise RuntimeError(
+            "Automatically determined reduction data type does not "
+            "have direct implementation"
+        )
+    orig_out = out
+    if out is not None:
+        if not isinstance(out, dpt.usm_ndarray):
+            raise TypeError(
+                f"output array must be of usm_ndarray type, got {type(out)}"
+            )
+        if not out.flags.writable:
+            raise ValueError("provided `out` array is read-only")
+        if not keepdims:
+            final_res_shape = res_shape
+        else:
+            inp_shape = x.shape
+            final_res_shape = tuple(
+                inp_shape[i] if i not in axis else 1 for i in range(nd)
+            )
+        if not out.shape == final_res_shape:
+            raise ValueError(
+                "The shape of input and output arrays are inconsistent. "
+                f"Expected output shape is {final_res_shape}, got {out.shape}"
+            )
+        if res_dt != out.dtype:
+            raise ValueError(
+                f"Output array of type {res_dt} is needed, got {out.dtype}"
+            )
+        if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None:
+            raise ExecutionPlacementError(
+                "Input and output allocation queues are not compatible"
+            )
+        if keepdims:
+            out = dpt_ext.squeeze(out, axis=axis)
+            orig_out = out
+        if ti._array_overlap(x, out) and implemented_types:
+            out = dpt_ext.empty_like(out)
+    else:
+        out = dpt_ext.empty(
+            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
+        )
+
+    _manager = SequentialOrderManager[q]
+    dep_evs = _manager.submitted_events
+    if red_nd == 0:
+        ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arr, dst=out, sycl_queue=q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_e_cpy, cpy_e)
+        if not (orig_out is None or orig_out is out):
+            ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out, dst=orig_out, sycl_queue=q, depends=[cpy_e]
+            )
+            _manager.add_event_pair(ht_e_cpy2, cpy2_e)
+            out = orig_out
+        return out
+
+    if implemented_types:
+        ht_e, red_e = _reduction_fn(
+            src=arr,
+            trailing_dims_to_reduce=red_nd,
+            dst=out,
+            sycl_queue=q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_e, red_e)
+        if not (orig_out is None or orig_out is out):
+            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out, dst=orig_out, sycl_queue=q, depends=[red_e]
+            )
+            _manager.add_event_pair(ht_e_cpy, cpy_e)
+            out = orig_out
+    else:
+        if _dtype_supported(res_dt, res_dt, res_usm_type, q):
+            tmp = dpt_ext.empty(
+                arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
+            )
+            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=arr, dst=tmp, sycl_queue=q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_e_cpy, cpy_e)
+            ht_e_red, red_ev = _reduction_fn(
+                src=tmp,
+                trailing_dims_to_reduce=red_nd,
+                dst=out,
+                sycl_queue=q,
+                depends=[cpy_e],
+            )
+            _manager.add_event_pair(ht_e_red, red_ev)
+        else:
+            buf_dt = _default_reduction_type_fn(inp_dt, q)
+            tmp = dpt_ext.empty(
+                arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
+            )
+            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=arr, dst=tmp, sycl_queue=q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_e_cpy, cpy_e)
+            tmp_res = dpt_ext.empty(
+                res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
+            )
+            ht_e_red, r_e = _reduction_fn(
+                src=tmp,
+                trailing_dims_to_reduce=red_nd,
+                dst=tmp_res,
+                sycl_queue=q,
+                depends=[cpy_e],
+            )
+            _manager.add_event_pair(ht_e_red, r_e)
+            ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=tmp_res, dst=out, sycl_queue=q, depends=[r_e]
+            )
+            _manager.add_event_pair(ht_e_cpy2, cpy2_e)
+
+    if keepdims:
+        res_shape = res_shape + (1,) * red_nd
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm)
+    return out
+
+
+def _search_over_axis(x, axis, keepdims, out, _reduction_fn):
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    nd = x.ndim
+    if axis is None:
+        axis = tuple(range(nd))
+        perm = list(axis)
+        x_tmp = x
+    else:
+        if isinstance(axis, int):
+            axis = (axis,)
+        else:
+            raise TypeError(
+                f"'axis' argument expected to have type 'int' "
+                r"or be `None`, "
+                f"got type {type(axis)}"
+            )
+        axis = normalize_axis_tuple(axis, nd, "axis")
+        perm = [i for i in range(nd) if i not in axis] + list(axis)
+        x_tmp = dpt_ext.permute_dims(x, perm)
+    axis = normalize_axis_tuple(axis, nd, "axis")
+    red_nd = len(axis)
+    if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]):
+        raise ValueError("reduction cannot be performed over zero-size axes")
+    res_shape = x_tmp.shape[: nd - red_nd]
+    exec_q = x.sycl_queue
+    res_dt = ti.default_device_index_type(exec_q.sycl_device)
+    res_usm_type = x.usm_type
+
+    orig_out = out
+    if out is not None:
+        if not isinstance(out, dpt.usm_ndarray):
+            raise TypeError(
+                f"output array must be of usm_ndarray type, got {type(out)}"
+            )
+        if not out.flags.writable:
+            raise ValueError("provided `out` array is read-only")
+        if not keepdims:
+            final_res_shape = res_shape
+        else:
+            inp_shape = x.shape
+            final_res_shape = tuple(
+                inp_shape[i] if i not in axis else 1 for i in range(nd)
+            )
+        if not out.shape == final_res_shape:
+            raise ValueError(
+                "The shape of input and output arrays are inconsistent. "
+                f"Expected output shape is {final_res_shape}, got {out.shape}"
+            )
+        if res_dt != out.dtype:
+            raise ValueError(
+                f"Output array of type {res_dt} is needed, got {out.dtype}"
+            )
+        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise ExecutionPlacementError(
+                "Input and output allocation queues are not compatible"
+            )
+        if keepdims:
+            out = dpt_ext.squeeze(out, axis=axis)
+            orig_out = out
+        if ti._array_overlap(x, out) and red_nd > 0:
+            out = dpt_ext.empty_like(out)
+    else:
+        out = dpt_ext.empty(
+            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q
+        )
+
+    _manager = SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    if red_nd == 0:
+        ht_e_fill, fill_ev = ti._full_usm_ndarray(
+            fill_value=0, dst=out, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_e_fill, fill_ev)
+        return out
+
+    hev, red_ev = _reduction_fn(
+        src=x_tmp,
+        trailing_dims_to_reduce=red_nd,
+        dst=out,
+        sycl_queue=exec_q,
+        depends=dep_evs,
+    )
+    _manager.add_event_pair(hev, red_ev)
+    if not (orig_out is None or orig_out is out):
+        ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev]
+        )
+        _manager.add_event_pair(ht_e_cpy2, cpy2_e)
+        out = orig_out
+
+    if keepdims:
+        res_shape = res_shape + (1,) * red_nd
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm)
+    return out
+
+
+def argmax(x, /, *, axis=None, keepdims=False, out=None):
+    """
+    Returns the indices of the maximum values of the input array ``x`` along a
+    specified axis.
+
+    When the maximum value occurs multiple times, the indices corresponding to
+    the first occurrence are returned.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which to search. If ``None``, returns the index of the
+            maximum value of the flattened array.
+            Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and the
+            expected data type of the result.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the indices of the first occurrence of the
+            maximum values. If the entire array was searched, a
+            zero-dimensional array is returned. The returned array has the
+            default array index data type for the device of ``x``.
+    """
+    return _search_over_axis(x, axis, keepdims, out, tri._argmax_over_axis)
+
+
+def argmin(x, /, *, axis=None, keepdims=False, out=None):
+    """
+    Returns the indices of the minimum values of the input array ``x`` along a
+    specified axis.
+
+    When the minimum value occurs multiple times, the indices corresponding to
+    the first occurrence are returned.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which to search. If ``None``, returns the index of the
+            minimum value of the flattened array.
+            Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and the
+            expected data type of the result.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the indices of the first occurrence of the
+            minimum values. If the entire array was searched, a
+            zero-dimensional array is returned. The returned array has the
+            default array index data type for the device of ``x``.
+    """
+    return _search_over_axis(x, axis, keepdims, out, tri._argmin_over_axis)
+
+
+def count_nonzero(x, /, *, axis=None, keepdims=False, out=None):
+    """
+    Counts the number of elements in the input array ``x`` which are non-zero.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which to count. If a tuple of unique integers,
+            the number of non-zero values are computed over multiple axes.
+            If ``None``, the number of non-zero values is computed over the
+            entire array.
+            Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and data
+            type.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the count of non-zero values. If the sum was
+            computed over the entire array, a zero-dimensional array is
+            returned. The returned array will have the default array index data
+            type.
+    """
+    if x.dtype != dpt.bool:
+        x = dpt.astype(x, dpt.bool, copy=False)
+    return sum(
+        x,
+        axis=axis,
+        dtype=ti.default_device_index_type(x.sycl_device),
+        keepdims=keepdims,
+        out=out,
+    )
+
+
+def logsumexp(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
+    """
+    Calculates the logarithm of the sum of exponentials of elements in the
+    input array ``x``.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which values must be computed. If a tuple
+            of unique integers, values are computed over multiple axes.
+            If ``None``, the result is computed over the entire array.
+            Default: ``None``.
+        dtype (Optional[dtype]):
+            data type of the returned array. If ``None``, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If ``x`` has a real-valued floating-point data type, the
+              returned array will have the same data type as ``x``.
+            * If ``x`` has a boolean or integral data type, the returned array
+              will have the default floating point data type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has a complex-valued floating-point data type,
+              an error is raised.
+
+            If the data type (either specified or resolved) differs from the
+            data type of ``x``, the input array elements are cast to the
+            specified data type before computing the result.
+            Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and the
+            expected data type of the result or (if provided) ``dtype``.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the results. If the result was computed over
+            the entire array, a zero-dimensional array is returned.
+            The returned array has the data type as described in the
+            ``dtype`` parameter description above.
+    """
+    return _reduction_over_axis(
+        x,
+        axis,
+        dtype,
+        keepdims,
+        out,
+        tri._logsumexp_over_axis,
+        lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported(
+            inp_dt, res_dt
+        ),
+        _default_accumulation_dtype_fp_types,
+    )
+
+
+def max(x, /, *, axis=None, keepdims=False, out=None):
+    """
+    Calculates the maximum value of the input array ``x``.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which maxima must be computed. If a tuple
+            of unique integers, the maxima are computed over multiple axes.
+            If ``None``, the max is computed over the entire array.
+            Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and the
+            expected data type of the result.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the maxima. If the max was computed over the
+            entire array, a zero-dimensional array is returned. The returned
+            array has the same data type as ``x``.
+    """
+    return _comparison_over_axis(x, axis, keepdims, out, tri._max_over_axis)
+
+
+def min(x, /, *, axis=None, keepdims=False, out=None):
+    """
+    Calculates the minimum value of the input array ``x``.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which minima must be computed. If a tuple
+            of unique integers, the minima are computed over multiple axes.
+            If ``None``, the min is computed over the entire array.
+            Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and the
+            expected data type of the result.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the minima. If the min was computed over the
+            entire array, a zero-dimensional array is returned. The returned
+            array has the same data type as ``x``.
+    """
+    return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis)
+
+
+def prod(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
+    """
+    Calculates the product of elements in the input array ``x``.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which products must be computed. If a tuple
+            of unique integers, products are computed over multiple axes.
+            If ``None``, the product is computed over the entire array.
+            Default: ``None``.
+        dtype (Optional[dtype]):
+            data type of the returned array. If ``None``, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If ``x`` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              ``x``.
+            * If ``x`` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has unsigned integral data type, the returned array
+              will have the default unsigned integral type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has a boolean data type, the returned array will
+              have the default signed integral type for the device
+              where input array ``x`` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of ``x``, the input array elements are cast to the
+            specified data type before computing the product.
+            Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and the
+            expected data type of the result or (if provided) ``dtype``.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the products. If the product was computed over
+            the entire array, a zero-dimensional array is returned. The
+            returned array has the data type as described in the ``dtype``
+            parameter description above.
+    """
+    return _reduction_over_axis(
+        x,
+        axis,
+        dtype,
+        keepdims,
+        out,
+        tri._prod_over_axis,
+        tri._prod_over_axis_dtype_supported,
+        _default_accumulation_dtype,
+    )
+
+
+def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
+    """
+    Calculates the square root of the sum of squares of elements in the input
+    array ``x``.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which values must be computed. If a tuple
+            of unique integers, values are computed over multiple axes.
+            If ``None``, the result is computed over the entire array.
+            Default: ``None``.
+        dtype (Optional[dtype]):
+            data type of the returned array. If ``None``, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If ``x`` has a real-valued floating-point data type, the
+              returned array will have the same data type as ``x``.
+            * If ``x`` has a boolean or integral data type, the returned array
+              will have the default floating point data type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has a complex-valued floating-point data type,
+              an error is raised.
+
+            If the data type (either specified or resolved) differs from the
+            data type of ``x``, the input array elements are cast to the
+            specified data type before computing the result. Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and the
+            expected data type of the result or (if provided) ``dtype``.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the results. If the result was computed over
+            the entire array, a zero-dimensional array is returned. The
+            returned array has the data type as described in the ``dtype``
+            parameter description above.
+    """
+    return _reduction_over_axis(
+        x,
+        axis,
+        dtype,
+        keepdims,
+        out,
+        tri._hypot_over_axis,
+        lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported(
+            inp_dt, res_dt
+        ),
+        _default_accumulation_dtype_fp_types,
+    )
+
+
+def sum(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
+    """
+    Calculates the sum of elements in the input array ``x``.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which sums must be computed. If a tuple
+            of unique integers, sums are computed over multiple axes.
+            If ``None``, the sum is computed over the entire array.
+            Default: ``None``.
+        dtype (Optional[dtype]):
+            data type of the returned array. If ``None``, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If ``x`` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              ``x``.
+            * If ``x`` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has unsigned integral data type, the returned array
+              will have the default unsigned integral type for the device
+              where input array ``x`` is allocated.
+              array ``x`` is allocated.
+            * If ``x`` has a boolean data type, the returned array will
+              have the default signed integral type for the device
+              where input array ``x`` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of ``x``, the input array elements are cast to the
+            specified data type before computing the sum.
+            Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and the
+            expected data type of the result or (if provided) ``dtype``.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the sums. If the sum was computed over the
+            entire array, a zero-dimensional array is returned. The returned
+            array has the data type as described in the ``dtype`` parameter
+            description above.
+    """
+    return _reduction_over_axis(
+        x,
+        axis,
+        dtype,
+        keepdims,
+        out,
+        tri._sum_over_axis,
+        tri._sum_over_axis_dtype_supported,
+        _default_accumulation_dtype,
+    )
diff --git a/dpctl_ext/tensor/_utility_functions.py b/dpctl_ext/tensor/_utility_functions.py
new file mode 100644
index 000000000000..a122ac3d6cea
--- /dev/null
+++ b/dpctl_ext/tensor/_utility_functions.py
@@ -0,0 +1,509 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import builtins
+import operator
+
+import dpctl.tensor as dpt
+import dpctl.utils as du
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._tensor_impl as ti
+import dpctl_ext.tensor._tensor_reductions_impl as tri
+
+from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
+from ._scalar_utils import (
+    _get_dtype,
+    _get_queue_usm_type,
+    _get_shape,
+    _validate_dtype,
+)
+from ._type_utils import (
+    _resolve_one_strong_one_weak_types,
+    _resolve_one_strong_two_weak_types,
+)
+
+
+def _boolean_reduction(x, axis, keepdims, func):
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    nd = x.ndim
+    if axis is None:
+        red_nd = nd
+        # case of a scalar
+        if red_nd == 0:
+            return dpt_ext.astype(x, dpt.bool)
+        x_tmp = x
+        res_shape = ()
+        perm = list(range(nd))
+    else:
+        if not isinstance(axis, (tuple, list)):
+            axis = (axis,)
+        axis = normalize_axis_tuple(axis, nd, "axis")
+
+        red_nd = len(axis)
+        # check for axis=()
+        if red_nd == 0:
+            return dpt_ext.astype(x, dpt.bool)
+        perm = [i for i in range(nd) if i not in axis] + list(axis)
+        x_tmp = dpt_ext.permute_dims(x, perm)
+        res_shape = x_tmp.shape[: nd - red_nd]
+
+    exec_q = x.sycl_queue
+    res_usm_type = x.usm_type
+
+    _manager = du.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    # always allocate the temporary as
+    # int32 and usm-device  to ensure that atomic updates
+    # are supported
+    res_tmp = dpt_ext.empty(
+        res_shape,
+        dtype=dpt.int32,
+        usm_type="device",
+        sycl_queue=exec_q,
+    )
+    hev0, ev0 = func(
+        src=x_tmp,
+        trailing_dims_to_reduce=red_nd,
+        dst=res_tmp,
+        sycl_queue=exec_q,
+        depends=dep_evs,
+    )
+    _manager.add_event_pair(hev0, ev0)
+
+    # copy to boolean result array
+    res = dpt_ext.empty(
+        res_shape,
+        dtype=dpt.bool,
+        usm_type=res_usm_type,
+        sycl_queue=exec_q,
+    )
+    hev1, ev1 = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=res_tmp, dst=res, sycl_queue=exec_q, depends=[ev0]
+    )
+    _manager.add_event_pair(hev1, ev1)
+
+    if keepdims:
+        res_shape = res_shape + (1,) * red_nd
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        res = dpt_ext.permute_dims(dpt_ext.reshape(res, res_shape), inv_perm)
+    return res
+
+
+def all(x, /, *, axis=None, keepdims=False):
+    """
+    all(x, axis=None, keepdims=False)
+
+    Tests whether all input array elements evaluate to True along a given axis.
+
+    Args:
+        x (usm_ndarray): Input array.
+        axis (Optional[Union[int, Tuple[int,...]]]): Axis (or axes)
+            along which to perform a logical AND reduction.
+            When `axis` is `None`, a logical AND reduction
+            is performed over all dimensions of `x`.
+            If `axis` is negative, the axis is counted from
+            the last dimension to the first.
+            Default: `None`.
+        keepdims (bool, optional): If `True`, the reduced axes are included
+            in the result as singleton dimensions, and the result is
+            broadcastable to the input array shape.
+            If `False`, the reduced axes are not included in the result.
+            Default: `False`.
+
+    Returns:
+        usm_ndarray:
+            An array with a data type of `bool`
+            containing the results of the logical AND reduction.
+    """
+    return _boolean_reduction(x, axis, keepdims, tri._all)
+
+
+def any(x, /, *, axis=None, keepdims=False):
+    """
+    any(x, axis=None, keepdims=False)
+
+    Tests whether any input array elements evaluate to True along a given axis.
+
+    Args:
+        x (usm_ndarray): Input array.
+        axis (Optional[Union[int, Tuple[int,...]]]): Axis (or axes)
+            along which to perform a logical OR reduction.
+            When `axis` is `None`, a logical OR reduction
+            is performed over all dimensions of `x`.
+            If `axis` is negative, the axis is counted from
+            the last dimension to the first.
+            Default: `None`.
+        keepdims (bool, optional): If `True`, the reduced axes are included
+            in the result as singleton dimensions, and the result is
+            broadcastable to the input array shape.
+            If `False`, the reduced axes are not included in the result.
+            Default: `False`.
+
+    Returns:
+        usm_ndarray:
+            An array with a data type of `bool`
+            containing the results of the logical OR reduction.
+    """
+    return _boolean_reduction(x, axis, keepdims, tri._any)
+
+
+def _validate_diff_shape(sh1, sh2, axis):
+    """
+    Utility for validating that two shapes `sh1` and `sh2`
+    are possible to concatenate along `axis`.
+    """
+    if not sh2:
+        # scalars will always be accepted
+        return True
+    else:
+        sh1_ndim = len(sh1)
+        if sh1_ndim == len(sh2) and builtins.all(
+            sh1[i] == sh2[i] for i in range(sh1_ndim) if i != axis
+        ):
+            return True
+        else:
+            return False
+
+
+def _concat_diff_input(arr, axis, prepend, append):
+    """
+    Concatenates `arr`, `prepend` and, `append` along `axis`,
+    where `arr` is an array and `prepend` and `append` are
+    any mixture of arrays and scalars.
+    """
+    if prepend is not None and append is not None:
+        q1, x_usm_type = arr.sycl_queue, arr.usm_type
+        q2, prepend_usm_type = _get_queue_usm_type(prepend)
+        q3, append_usm_type = _get_queue_usm_type(append)
+        if q2 is None and q3 is None:
+            exec_q = q1
+            coerced_usm_type = x_usm_type
+        elif q3 is None:
+            exec_q = du.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise du.ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            coerced_usm_type = du.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    prepend_usm_type,
+                )
+            )
+        elif q2 is None:
+            exec_q = du.get_execution_queue((q1, q3))
+            if exec_q is None:
+                raise du.ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            coerced_usm_type = du.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    append_usm_type,
+                )
+            )
+        else:
+            exec_q = du.get_execution_queue((q1, q2, q3))
+            if exec_q is None:
+                raise du.ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            coerced_usm_type = du.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    prepend_usm_type,
+                    append_usm_type,
+                )
+            )
+        du.validate_usm_type(coerced_usm_type, allow_none=False)
+        arr_shape = arr.shape
+        prepend_shape = _get_shape(prepend)
+        append_shape = _get_shape(append)
+        if not builtins.all(
+            isinstance(s, (tuple, list))
+            for s in (
+                prepend_shape,
+                append_shape,
+            )
+        ):
+            raise TypeError(
+                "Shape of arguments can not be inferred. "
+                "Arguments are expected to be "
+                "lists, tuples, or both"
+            )
+        valid_prepend_shape = _validate_diff_shape(
+            arr_shape, prepend_shape, axis
+        )
+        if not valid_prepend_shape:
+            raise ValueError(
+                f"`diff` argument `prepend` with shape {prepend_shape} is "
+                f"invalid for first input with shape {arr_shape}"
+            )
+        valid_append_shape = _validate_diff_shape(arr_shape, append_shape, axis)
+        if not valid_append_shape:
+            raise ValueError(
+                f"`diff` argument `append` with shape {append_shape} is invalid"
+                f" for first input with shape {arr_shape}"
+            )
+        sycl_dev = exec_q.sycl_device
+        arr_dtype = arr.dtype
+        prepend_dtype = _get_dtype(prepend, sycl_dev)
+        append_dtype = _get_dtype(append, sycl_dev)
+        if not builtins.all(
+            _validate_dtype(o) for o in (prepend_dtype, append_dtype)
+        ):
+            raise ValueError("Operands have unsupported data types")
+        prepend_dtype, append_dtype = _resolve_one_strong_two_weak_types(
+            arr_dtype, prepend_dtype, append_dtype, sycl_dev
+        )
+        if isinstance(prepend, dpt.usm_ndarray):
+            a_prepend = prepend
+        else:
+            a_prepend = dpt_ext.asarray(
+                prepend,
+                dtype=prepend_dtype,
+                usm_type=coerced_usm_type,
+                sycl_queue=exec_q,
+            )
+        if isinstance(append, dpt.usm_ndarray):
+            a_append = append
+        else:
+            a_append = dpt_ext.asarray(
+                append,
+                dtype=append_dtype,
+                usm_type=coerced_usm_type,
+                sycl_queue=exec_q,
+            )
+        if not prepend_shape:
+            prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
+            a_prepend = dpt_ext.broadcast_to(a_prepend, prepend_shape)
+        if not append_shape:
+            append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
+            a_append = dpt_ext.broadcast_to(a_append, append_shape)
+        return dpt_ext.concat((a_prepend, arr, a_append), axis=axis)
+    elif prepend is not None:
+        q1, x_usm_type = arr.sycl_queue, arr.usm_type
+        q2, prepend_usm_type = _get_queue_usm_type(prepend)
+        if q2 is None:
+            exec_q = q1
+            coerced_usm_type = x_usm_type
+        else:
+            exec_q = du.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise du.ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            coerced_usm_type = du.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    prepend_usm_type,
+                )
+            )
+        du.validate_usm_type(coerced_usm_type, allow_none=False)
+        arr_shape = arr.shape
+        prepend_shape = _get_shape(prepend)
+        if not isinstance(prepend_shape, (tuple, list)):
+            raise TypeError(
+                "Shape of argument can not be inferred. "
+                "Argument is expected to be a "
+                "list or tuple"
+            )
+        valid_prepend_shape = _validate_diff_shape(
+            arr_shape, prepend_shape, axis
+        )
+        if not valid_prepend_shape:
+            raise ValueError(
+                f"`diff` argument `prepend` with shape {prepend_shape} is "
+                f"invalid for first input with shape {arr_shape}"
+            )
+        sycl_dev = exec_q.sycl_device
+        arr_dtype = arr.dtype
+        prepend_dtype = _get_dtype(prepend, sycl_dev)
+        if not _validate_dtype(prepend_dtype):
+            raise ValueError("Operand has unsupported data type")
+        prepend_dtype = _resolve_one_strong_one_weak_types(
+            arr_dtype, prepend_dtype, sycl_dev
+        )
+        if isinstance(prepend, dpt.usm_ndarray):
+            a_prepend = prepend
+        else:
+            a_prepend = dpt_ext.asarray(
+                prepend,
+                dtype=prepend_dtype,
+                usm_type=coerced_usm_type,
+                sycl_queue=exec_q,
+            )
+        if not prepend_shape:
+            prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
+            a_prepend = dpt_ext.broadcast_to(a_prepend, prepend_shape)
+        return dpt_ext.concat((a_prepend, arr), axis=axis)
+    elif append is not None:
+        q1, x_usm_type = arr.sycl_queue, arr.usm_type
+        q2, append_usm_type = _get_queue_usm_type(append)
+        if q2 is None:
+            exec_q = q1
+            coerced_usm_type = x_usm_type
+        else:
+            exec_q = du.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise du.ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            coerced_usm_type = du.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    append_usm_type,
+                )
+            )
+        du.validate_usm_type(coerced_usm_type, allow_none=False)
+        arr_shape = arr.shape
+        append_shape = _get_shape(append)
+        if not isinstance(append_shape, (tuple, list)):
+            raise TypeError(
+                "Shape of argument can not be inferred. "
+                "Argument is expected to be a "
+                "list or tuple"
+            )
+        valid_append_shape = _validate_diff_shape(arr_shape, append_shape, axis)
+        if not valid_append_shape:
+            raise ValueError(
+                f"`diff` argument `append` with shape {append_shape} is invalid"
+                f" for first input with shape {arr_shape}"
+            )
+        sycl_dev = exec_q.sycl_device
+        arr_dtype = arr.dtype
+        append_dtype = _get_dtype(append, sycl_dev)
+        if not _validate_dtype(append_dtype):
+            raise ValueError("Operand has unsupported data type")
+        append_dtype = _resolve_one_strong_one_weak_types(
+            arr_dtype, append_dtype, sycl_dev
+        )
+        if isinstance(append, dpt.usm_ndarray):
+            a_append = append
+        else:
+            a_append = dpt_ext.asarray(
+                append,
+                dtype=append_dtype,
+                usm_type=coerced_usm_type,
+                sycl_queue=exec_q,
+            )
+        if not append_shape:
+            append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
+            a_append = dpt_ext.broadcast_to(a_append, append_shape)
+        return dpt_ext.concat((arr, a_append), axis=axis)
+    else:
+        arr1 = arr
+    return arr1
+
+
+def diff(x, /, *, axis=-1, n=1, prepend=None, append=None):
+    """
+    Calculates the `n`-th discrete forward difference of `x` along `axis`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (int):
+            axis along which to compute the difference. A valid axis must be on
+            the interval `[-N, N)`, where `N` is the rank (number of
+            dimensions) of `x`.
+            Default: `-1`
+        n (int):
+            number of times to recursively compute the difference.
+            Default: `1`.
+        prepend (Union[usm_ndarray, bool, int, float, complex]):
+            value or values to prepend to the specified axis before taking the
+            difference.
+            Must have the same shape as `x` except along `axis`, which can have
+            any shape.
+            Default: `None`.
+        append (Union[usm_ndarray, bool, int, float, complex]):
+            value or values to append to the specified axis before taking the
+            difference.
+            Must have the same shape as `x` except along `axis`, which can have
+            any shape.
+            Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            an array containing the `n`-th differences. The array will have the
+            same shape as `x`, except along `axis`, which will have shape:
+            ``prepend.shape[axis] + x.shape[axis] + append.shape[axis] - n``
+
+            The data type of the returned array is determined by the Type
+            Promotion Rules.
+    """
+
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(x)}"
+        )
+    x_nd = x.ndim
+    axis = normalize_axis_index(operator.index(axis), x_nd)
+    n = operator.index(n)
+    if n < 0:
+        raise ValueError(f"`n` must be positive, got {n}")
+    arr = _concat_diff_input(x, axis, prepend, append)
+    if n == 0:
+        return arr
+    # form slices and recurse
+    sl0 = tuple(
+        slice(None) if i != axis else slice(1, None) for i in range(x_nd)
+    )
+    sl1 = tuple(
+        slice(None) if i != axis else slice(None, -1) for i in range(x_nd)
+    )
+
+    diff_op = dpt.not_equal if x.dtype == dpt.bool else dpt.subtract
+    if n > 1:
+        arr_tmp0 = diff_op(arr[sl0], arr[sl1])
+        arr_tmp1 = diff_op(arr_tmp0[sl0], arr_tmp0[sl1])
+        n = n - 2
+        if n > 0:
+            sl3 = tuple(
+                slice(None) if i != axis else slice(None, -2)
+                for i in range(x_nd)
+            )
+            for _ in range(n):
+                arr_tmp0_sliced = arr_tmp0[sl3]
+                diff_op(arr_tmp1[sl0], arr_tmp1[sl1], out=arr_tmp0_sliced)
+                arr_tmp0, arr_tmp1 = arr_tmp1, arr_tmp0_sliced
+        arr = arr_tmp1
+    else:
+        arr = diff_op(arr[sl0], arr[sl1])
+    return arr
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp b/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp
new file mode 100644
index 000000000000..ee6431dec637
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp
@@ -0,0 +1,3323 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor reduction along axis.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "utils/math_utils.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels
+{
+
+using dpctl::tensor::ssize_t;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace reduction_detail
+{
+
+inline std::size_t get_work_group_size(const sycl::device &d)
+{
+    // prevents running out of resources on CPU
+    return std::min<std::size_t>(
+        2048, d.get_info<sycl::info::device::max_work_group_size>() / 2);
+}
+
+} // namespace reduction_detail
+
+template <typename ReductionOpT, typename T>
+struct needs_workaround
+{
+    static constexpr bool value =
+        (std::is_same_v<ReductionOpT, sycl::multiplies<T>> &&
+         (std::is_same_v<T, std::int64_t> ||
+          std::is_same_v<T, std::uint64_t>)) ||
+        (__LIBSYCL_MAJOR_VERSION < 7 && std::is_same_v<T, bool> &&
+         std::is_same_v<ReductionOpT, sycl::logical_or<T>>);
+};
+
+template <typename ReductionOpT, typename T>
+struct can_use_reduce_over_group
+{
+    static constexpr bool value =
+        sycl::has_known_identity<ReductionOpT, T>::value &&
+        !needs_workaround<ReductionOpT, T>::value;
+};
+
+template <typename argT,
+          typename outT,
+          typename ReductionOp,
+          typename InputOutputIterIndexerT,
+          typename InputRedIndexerT>
+struct SequentialReduction
+{
+private:
+    const argT *inp_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOp reduction_op_;
+    outT identity_;
+    InputOutputIterIndexerT inp_out_iter_indexer_;
+    InputRedIndexerT inp_reduced_dims_indexer_;
+    std::size_t reduction_max_gid_ = 0;
+
+public:
+    SequentialReduction(const argT *inp,
+                        outT *res,
+                        const ReductionOp &reduction_op,
+                        const outT &identity_val,
+                        const InputOutputIterIndexerT &arg_res_iter_indexer,
+                        const InputRedIndexerT &arg_reduced_dims_indexer,
+                        std::size_t reduction_size)
+        : inp_(inp), out_(res), reduction_op_(reduction_op),
+          identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer),
+          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
+          reduction_max_gid_(reduction_size)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+
+        auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]);
+        const ssize_t &inp_iter_offset =
+            inp_out_iter_offsets_.get_first_offset();
+        const ssize_t &out_iter_offset =
+            inp_out_iter_offsets_.get_second_offset();
+
+        outT red_val(identity_);
+        for (std::size_t m = 0; m < reduction_max_gid_; ++m) {
+            const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m);
+            const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
+
+            using dpctl::tensor::type_utils::convert_impl;
+            outT val;
+            if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
+                          su_ns::IsLogicalOr<outT, ReductionOp>::value)
+            {
+                val = convert_impl<bool, argT>(inp_[inp_offset]);
+            }
+            else {
+                val = convert_impl<outT, argT>(inp_[inp_offset]);
+            }
+            red_val = reduction_op_(red_val, val);
+        }
+
+        out_[out_iter_offset] = red_val;
+    }
+};
+
+/* === Reduction, using sycl::reduce_over_group, and sycl::atomic_ref === */
+
+/*
+  This kernel only works for outT with sizeof(outT) == 4, or sizeof(outT) == 8
+  if the device has aspect atomic64 and only with those supported by
+  sycl::atomic_ref
+*/
+template <typename argT,
+          typename outT,
+          typename ReductionOp,
+          typename InputOutputIterIndexerT,
+          typename InputRedIndexerT>
+struct ReductionOverGroupWithAtomicFunctor
+{
+private:
+    const argT *inp_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOp reduction_op_;
+    outT identity_;
+    InputOutputIterIndexerT inp_out_iter_indexer_;
+    InputRedIndexerT inp_reduced_dims_indexer_;
+    std::size_t reduction_max_gid_ = 0;
+    std::size_t iter_gws_ = 1;
+    std::size_t reductions_per_wi = 16;
+
+public:
+    ReductionOverGroupWithAtomicFunctor(
+        const argT *data,
+        outT *res,
+        const ReductionOp &reduction_op,
+        const outT &identity_val,
+        const InputOutputIterIndexerT &arg_res_iter_indexer,
+        const InputRedIndexerT &arg_reduced_dims_indexer,
+        std::size_t reduction_size,
+        std::size_t iteration_size,
+        std::size_t reduction_size_per_wi)
+        : inp_(data), out_(res), reduction_op_(reduction_op),
+          identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer),
+          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
+          reduction_max_gid_(reduction_size), iter_gws_(iteration_size),
+          reductions_per_wi(reduction_size_per_wi)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
+        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); //   0 <= reduction_lid < wg
+
+        // work-items operate over input with indices
+        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //   + reduction_lid
+        // for 0 <= m < reductions_per_wi
+
+        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
+        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
+        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
+
+        outT local_red_val(identity_);
+        std::size_t arg_reduce_gid0 =
+            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
+        std::size_t arg_reduce_gid_max = std::min(
+            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
+
+        for (std::size_t arg_reduce_gid = arg_reduce_gid0;
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
+        {
+            auto inp_reduction_offset =
+                inp_reduced_dims_indexer_(arg_reduce_gid);
+            auto inp_offset = inp_iter_offset + inp_reduction_offset;
+
+            using dpctl::tensor::type_utils::convert_impl;
+            outT val;
+            if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
+                          su_ns::IsLogicalOr<outT, ReductionOp>::value)
+            {
+                // handle nans
+                val = convert_impl<bool, argT>(inp_[inp_offset]);
+            }
+            else {
+                val = convert_impl<outT, argT>(inp_[inp_offset]);
+            }
+
+            local_red_val = reduction_op_(local_red_val, val);
+        }
+
+        auto work_group = it.get_group();
+        // This only works if reduction_op_ is from small set of operators
+        outT red_val_over_wg;
+        if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value) {
+            red_val_over_wg = static_cast<outT>(
+                sycl::all_of_group(work_group, local_red_val));
+        }
+        else if constexpr (su_ns::IsLogicalOr<outT, ReductionOp>::value) {
+            red_val_over_wg = static_cast<outT>(
+                sycl::any_of_group(work_group, local_red_val));
+        }
+        else {
+            red_val_over_wg = sycl::reduce_over_group(work_group, local_red_val,
+                                                      identity_, reduction_op_);
+        }
+
+        if (work_group.leader()) {
+            sycl::atomic_ref<outT, sycl::memory_order::relaxed,
+                             sycl::memory_scope::device,
+                             sycl::access::address_space::global_space>
+                res_ref(out_[out_iter_offset]);
+            if constexpr (su_ns::IsPlus<outT, ReductionOp>::value) {
+                res_ref += red_val_over_wg;
+            }
+            else if constexpr (su_ns::IsMaximum<outT, ReductionOp>::value) {
+                res_ref.fetch_max(red_val_over_wg);
+            }
+            else if constexpr (su_ns::IsMinimum<outT, ReductionOp>::value) {
+                res_ref.fetch_min(red_val_over_wg);
+            }
+            else if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value) {
+                res_ref.fetch_and(red_val_over_wg);
+            }
+            else if constexpr (su_ns::IsLogicalOr<outT, ReductionOp>::value) {
+                res_ref.fetch_or(red_val_over_wg);
+            }
+            else {
+                outT read_val = res_ref.load();
+                outT new_val{};
+                do {
+                    new_val = reduction_op_(read_val, red_val_over_wg);
+                } while (!res_ref.compare_exchange_strong(read_val, new_val));
+            }
+        }
+    }
+};
+
+/* === Reduction, using custom_reduce_over_group, and sycl::atomic_ref === */
+
+template <typename argT,
+          typename outT,
+          typename ReductionOp,
+          typename InputOutputIterIndexerT,
+          typename InputRedIndexerT,
+          typename SlmT>
+struct CustomReductionOverGroupWithAtomicFunctor
+{
+private:
+    const argT *inp_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOp reduction_op_;
+    outT identity_;
+    InputOutputIterIndexerT inp_out_iter_indexer_;
+    InputRedIndexerT inp_reduced_dims_indexer_;
+    SlmT local_mem_;
+    std::size_t reduction_max_gid_ = 0;
+    std::size_t iter_gws_ = 1;
+    std::size_t reductions_per_wi = 16;
+
+public:
+    CustomReductionOverGroupWithAtomicFunctor(
+        const argT *data,
+        outT *res,
+        const ReductionOp &reduction_op,
+        const outT &identity_val,
+        const InputOutputIterIndexerT &arg_res_iter_indexer,
+        const InputRedIndexerT &arg_reduced_dims_indexer,
+        SlmT local_mem,
+        std::size_t reduction_size,
+        std::size_t iteration_size,
+        std::size_t reduction_size_per_wi)
+        : inp_(data), out_(res), reduction_op_(reduction_op),
+          identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer),
+          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
+          local_mem_(local_mem), reduction_max_gid_(reduction_size),
+          iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
+        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); //   0 <= reduction_lid < wg
+
+        // work-items operate over input with indices
+        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //   + reduction_lid
+        // for 0 <= m < reductions_per_wi
+
+        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
+        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
+        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
+
+        outT local_red_val(identity_);
+        std::size_t arg_reduce_gid0 =
+            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
+        std::size_t arg_reduce_gid_max = std::min(
+            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
+
+        for (std::size_t arg_reduce_gid = arg_reduce_gid0;
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
+        {
+            auto inp_reduction_offset =
+                inp_reduced_dims_indexer_(arg_reduce_gid);
+            auto inp_offset = inp_iter_offset + inp_reduction_offset;
+
+            using dpctl::tensor::type_utils::convert_impl;
+            outT val;
+            if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
+                          su_ns::IsLogicalOr<outT, ReductionOp>::value)
+            {
+                // handle nans
+                val = convert_impl<bool, argT>(inp_[inp_offset]);
+            }
+            else {
+                val = convert_impl<outT, argT>(inp_[inp_offset]);
+            }
+
+            local_red_val = reduction_op_(local_red_val, val);
+        }
+
+        auto work_group = it.get_group();
+        outT red_val_over_wg = su_ns::custom_reduce_over_group(
+            work_group, local_mem_, local_red_val, reduction_op_);
+
+        if (work_group.leader()) {
+            sycl::atomic_ref<outT, sycl::memory_order::relaxed,
+                             sycl::memory_scope::device,
+                             sycl::access::address_space::global_space>
+                res_ref(out_[out_iter_offset]);
+            // retain these checks in case a reduce_over_group work-around is
+            // needed
+            if constexpr (su_ns::IsSyclPlus<outT, ReductionOp>::value) {
+                res_ref += red_val_over_wg;
+            }
+            else if constexpr (su_ns::IsSyclMaximum<outT, ReductionOp>::value) {
+                res_ref.fetch_max(red_val_over_wg);
+            }
+            else if constexpr (su_ns::IsSyclMinimum<outT, ReductionOp>::value) {
+                res_ref.fetch_min(red_val_over_wg);
+            }
+            else if constexpr (su_ns::IsSyclLogicalAnd<outT,
+                                                       ReductionOp>::value) {
+                res_ref.fetch_and(red_val_over_wg);
+            }
+            else if constexpr (su_ns::IsSyclLogicalOr<outT, ReductionOp>::value)
+            {
+                res_ref.fetch_or(red_val_over_wg);
+            }
+            else {
+                outT read_val = res_ref.load();
+                outT new_val{};
+                do {
+                    new_val = reduction_op_(read_val, red_val_over_wg);
+                } while (!res_ref.compare_exchange_strong(read_val, new_val));
+            }
+        }
+    }
+};
+
+template <typename argT,
+          typename outT,
+          typename ReductionOp,
+          typename InputOutputIterIndexerT,
+          typename InputRedIndexerT>
+struct ReductionOverGroupNoAtomicFunctor
+{
+private:
+    const argT *inp_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOp reduction_op_;
+    outT identity_;
+    InputOutputIterIndexerT inp_out_iter_indexer_;
+    InputRedIndexerT inp_reduced_dims_indexer_;
+    std::size_t reduction_max_gid_ = 0;
+    std::size_t iter_gws_ = 1;
+    std::size_t reductions_per_wi = 16;
+
+public:
+    ReductionOverGroupNoAtomicFunctor(
+        const argT *data,
+        outT *res,
+        const ReductionOp &reduction_op,
+        const outT &identity_val,
+        const InputOutputIterIndexerT &arg_res_iter_indexer,
+        const InputRedIndexerT &arg_reduced_dims_indexer,
+        std::size_t reduction_size,
+        std::size_t iteration_size,
+        std::size_t reduction_size_per_wi)
+        : inp_(data), out_(res), reduction_op_(reduction_op),
+          identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer),
+          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
+          reduction_max_gid_(reduction_size), iter_gws_(iteration_size),
+          reductions_per_wi(reduction_size_per_wi)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); //   0 <= reduction_lid < wg
+
+        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
+        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+        const std::size_t n_reduction_groups =
+            it.get_group_range(0) / iter_gws_;
+
+        // work-items operates over input with indices
+        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //   + reduction_lid
+        // for 0 <= m < reductions_per_wi
+
+        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
+        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
+        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
+
+        outT local_red_val(identity_);
+        std::size_t arg_reduce_gid0 =
+            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
+        for (std::size_t m = 0; m < reductions_per_wi; ++m) {
+            std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg;
+
+            if (arg_reduce_gid < reduction_max_gid_) {
+                auto inp_reduction_offset =
+                    inp_reduced_dims_indexer_(arg_reduce_gid);
+                auto inp_offset = inp_iter_offset + inp_reduction_offset;
+
+                using dpctl::tensor::type_utils::convert_impl;
+                outT val;
+                if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
+                              su_ns::IsLogicalOr<outT, ReductionOp>::value)
+                {
+                    // handle nans
+                    val = convert_impl<bool, argT>(inp_[inp_offset]);
+                }
+                else {
+                    val = convert_impl<outT, argT>(inp_[inp_offset]);
+                }
+
+                local_red_val = reduction_op_(local_red_val, val);
+            }
+        }
+
+        auto work_group = it.get_group();
+        // This only works if reduction_op_ is from small set of operators
+        outT red_val_over_wg;
+        if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value) {
+            red_val_over_wg = sycl::all_of_group(work_group, local_red_val);
+        }
+        else if constexpr (su_ns::IsLogicalOr<outT, ReductionOp>::value) {
+            red_val_over_wg = sycl::any_of_group(work_group, local_red_val);
+        }
+        else {
+            red_val_over_wg = sycl::reduce_over_group(work_group, local_red_val,
+                                                      identity_, reduction_op_);
+        }
+
+        if (work_group.leader()) {
+            // each group writes to a different memory location
+            out_[out_iter_offset * n_reduction_groups + reduction_batch_id] =
+                red_val_over_wg;
+        }
+    }
+};
+
+/* = Reduction, using custom_reduce_over_group and not using atomic_ref*/
+
+template <typename argT,
+          typename outT,
+          typename ReductionOp,
+          typename InputOutputIterIndexerT,
+          typename InputRedIndexerT,
+          typename SlmT>
+struct CustomReductionOverGroupNoAtomicFunctor
+{
+private:
+    const argT *inp_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOp reduction_op_;
+    outT identity_;
+    InputOutputIterIndexerT inp_out_iter_indexer_;
+    InputRedIndexerT inp_reduced_dims_indexer_;
+    SlmT local_mem_;
+    std::size_t reduction_max_gid_ = 0;
+    std::size_t iter_gws_ = 1;
+    std::size_t reductions_per_wi = 16;
+
+public:
+    CustomReductionOverGroupNoAtomicFunctor(
+        const argT *data,
+        outT *res,
+        const ReductionOp &reduction_op,
+        const outT &identity_val,
+        const InputOutputIterIndexerT &arg_res_iter_indexer,
+        const InputRedIndexerT &arg_reduced_dims_indexer,
+        SlmT local_mem,
+        std::size_t reduction_size,
+        std::size_t iteration_size,
+        std::size_t reduction_size_per_wi)
+        : inp_(data), out_(res), reduction_op_(reduction_op),
+          identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer),
+          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
+          local_mem_(local_mem), reduction_max_gid_(reduction_size),
+          iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); //   0 <= reduction_lid < wg
+
+        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
+        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+        const std::size_t n_reduction_groups =
+            it.get_group_range(0) / iter_gws_;
+
+        // work-items operates over input with indices
+        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //   + reduction_lid
+        // for 0 <= m < reductions_per_wi
+
+        auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
+        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
+        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
+
+        outT local_red_val(identity_);
+        std::size_t arg_reduce_gid0 =
+            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
+        for (std::size_t m = 0; m < reductions_per_wi; ++m) {
+            std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg;
+
+            if (arg_reduce_gid < reduction_max_gid_) {
+                auto inp_reduction_offset =
+                    inp_reduced_dims_indexer_(arg_reduce_gid);
+                auto inp_offset = inp_iter_offset + inp_reduction_offset;
+
+                using dpctl::tensor::type_utils::convert_impl;
+                outT val;
+                if constexpr (std::is_same_v<ReductionOp,
+                                             sycl::logical_and<outT>> ||
+                              std::is_same_v<ReductionOp,
+                                             sycl::logical_or<outT>>)
+                {
+                    // handle nans
+                    val = convert_impl<bool, argT>(inp_[inp_offset]);
+                }
+                else {
+                    val = convert_impl<outT, argT>(inp_[inp_offset]);
+                }
+
+                local_red_val = reduction_op_(local_red_val, val);
+            }
+        }
+
+        auto work_group = it.get_group();
+        // This only works if reduction_op_ is from small set of operators
+        outT red_val_over_wg = su_ns::custom_reduce_over_group(
+            work_group, local_mem_, local_red_val, reduction_op_);
+
+        if (work_group.leader()) {
+            // each group writes to a different memory location
+            out_[out_iter_offset * n_reduction_groups + reduction_batch_id] =
+                red_val_over_wg;
+        }
+    }
+};
+
+template <
+    typename argTy,
+    typename resTy,
+    typename ReductionOpT,
+    typename InputOutputIterIndexerT,
+    typename ReductionIndexerT,
+    template <typename T1, typename T2, typename T3, typename T4, typename T5>
+    class kernel_name_token>
+sycl::event
+    sequential_reduction(sycl::queue &exec_q,
+                         const argTy *arg,
+                         resTy *res,
+                         resTy identity_val,
+                         std::size_t iter_nelems,
+                         std::size_t reduction_nelems,
+                         const InputOutputIterIndexerT &in_out_iter_indexer,
+                         const ReductionIndexerT &reduction_indexer,
+                         const std::vector<sycl::event> &depends)
+{
+    sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using KernelName =
+            class kernel_name_token<argTy, resTy, ReductionOpT,
+                                    InputOutputIterIndexerT, ReductionIndexerT>;
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(iter_nelems),
+            SequentialReduction<argTy, resTy, ReductionOpT,
+                                InputOutputIterIndexerT, ReductionIndexerT>(
+                arg, res, ReductionOpT(), identity_val, in_out_iter_indexer,
+                reduction_indexer, reduction_nelems));
+    });
+
+    return red_ev;
+}
+
+template <typename BasedKernelName>
+class custom_reduction_wrapper;
+
+template <
+    typename argTy,
+    typename resTy,
+    typename ReductionOpT,
+    typename InputOutputIterIndexerT,
+    typename ReductionIndexerT,
+    template <typename T1, typename T2, typename T3, typename T4, typename T5>
+    class kernel_name_token>
+sycl::event
+    submit_atomic_reduction(sycl::queue &exec_q,
+                            const argTy *arg,
+                            resTy *res,
+                            resTy identity_val,
+                            std::size_t wg,
+                            std::size_t iter_nelems,
+                            std::size_t reduction_nelems,
+                            std::size_t reductions_per_wi,
+                            std::size_t reduction_groups,
+                            const InputOutputIterIndexerT &in_out_iter_indexer,
+                            const ReductionIndexerT &reduction_indexer,
+                            const std::vector<sycl::event> &depends)
+{
+    sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg};
+        auto localRange = sycl::range<1>{wg};
+        auto ndRange = sycl::nd_range<1>(globalRange, localRange);
+
+        if constexpr (can_use_reduce_over_group<ReductionOpT, resTy>::value) {
+            using KernelName =
+                class kernel_name_token<argTy, resTy, ReductionOpT,
+                                        InputOutputIterIndexerT,
+                                        ReductionIndexerT>;
+
+            cgh.parallel_for<KernelName>(
+                ndRange,
+                ReductionOverGroupWithAtomicFunctor<argTy, resTy, ReductionOpT,
+                                                    InputOutputIterIndexerT,
+                                                    ReductionIndexerT>(
+                    arg, res, ReductionOpT(), identity_val, in_out_iter_indexer,
+                    reduction_indexer, reduction_nelems, iter_nelems,
+                    reductions_per_wi));
+        }
+        else {
+            using SlmT = sycl::local_accessor<resTy, 1>;
+            SlmT local_memory = SlmT(localRange, cgh);
+
+            using KernelName = class custom_reduction_wrapper<
+                kernel_name_token<argTy, resTy, ReductionOpT,
+                                  InputOutputIterIndexerT, ReductionIndexerT>>;
+
+            cgh.parallel_for<KernelName>(
+                ndRange,
+                CustomReductionOverGroupWithAtomicFunctor<
+                    argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                    ReductionIndexerT, SlmT>(
+                    arg, res, ReductionOpT(), identity_val, in_out_iter_indexer,
+                    reduction_indexer, local_memory, reduction_nelems,
+                    iter_nelems, reductions_per_wi));
+        }
+    });
+    return red_ev;
+}
+
+template <typename T1, typename T2, typename T3>
+class reduction_over_group_with_atomics_init_krn;
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class reduction_seq_krn;
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class reduction_over_group_with_atomics_krn;
+
+typedef sycl::event (*reduction_strided_impl_fn_ptr)(
+    sycl::queue &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    char *,
+    int,
+    const ssize_t *,
+    ssize_t,
+    ssize_t,
+    int,
+    const ssize_t *,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+using dpctl::tensor::sycl_utils::choose_workgroup_size;
+
+template <typename argTy, typename resTy, typename ReductionOpT>
+sycl::event reduction_over_group_with_atomics_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
+                                  // matrix when reducing over rows)
+    std::size_t reduction_nelems, // size of each reduction  (length of rows,
+                                  // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    int iter_nd,
+    const ssize_t *iter_shape_and_strides,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    int red_nd,
+    const ssize_t *reduction_shape_stride,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        sycl::event comp_ev =
+            sequential_reduction<argTy, resTy, ReductionOpT,
+                                 InputOutputIterIndexerT, ReductionIndexerT,
+                                 reduction_seq_krn>(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+    else {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const ssize_t *const &res_shape = iter_shape_and_strides;
+            const ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                       res_strides);
+            using InitKernelName =
+                class reduction_over_group_with_atomics_init_krn<resTy, argTy,
+                                                                 ReductionOpT>;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for<InitKernelName>(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = identity_val;
+                });
+        });
+
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        static constexpr std::size_t preferred_reductions_per_wi = 8;
+        std::size_t reductions_per_wi =
+            (reduction_nelems < preferred_reductions_per_wi * wg)
+                ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
+                : preferred_reductions_per_wi;
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+
+        sycl::event comp_ev =
+            submit_atomic_reduction<argTy, resTy, ReductionOpT,
+                                    InputOutputIterIndexerT, ReductionIndexerT,
+                                    reduction_over_group_with_atomics_krn>(
+                exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+                reduction_nelems, reductions_per_wi, reduction_groups,
+                in_out_iter_indexer, reduction_indexer, {res_init_ev});
+
+        return comp_ev;
+    }
+}
+
+// Contig
+
+typedef sycl::event (*reduction_contig_impl_fn_ptr)(
+    sycl::queue &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    char *,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+/* @brief Reduce rows in a matrix */
+template <typename argTy, typename resTy, typename ReductionOpT>
+sycl::event reduction_axis1_over_group_with_atomics_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
+                                  // matrix when reducing over rows)
+    std::size_t reduction_nelems, // size of each reduction  (length of rows,
+                                  // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIterIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            InputIterIndexerT{/* size */ iter_nelems,
+                              /* step */ reduction_nelems},
+            NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        sycl::event comp_ev =
+            sequential_reduction<argTy, resTy, ReductionOpT,
+                                 InputOutputIterIndexerT, ReductionIndexerT,
+                                 reduction_seq_krn>(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+    else {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                RowsIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const RowsIndexerT rows_indexer{/* size */ iter_nelems,
+                                        /* step */ reduction_nelems};
+        static constexpr NoOpIndexerT result_indexer{};
+        const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer,
+                                                          result_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        static constexpr std::size_t preferred_reductions_per_wi = 8;
+        std::size_t reductions_per_wi =
+            (reduction_nelems < preferred_reductions_per_wi * wg)
+                ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
+                : preferred_reductions_per_wi;
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+
+        sycl::event comp_ev =
+            submit_atomic_reduction<argTy, resTy, ReductionOpT,
+                                    InputOutputIterIndexerT, ReductionIndexerT,
+                                    reduction_over_group_with_atomics_krn>(
+                exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+                reduction_nelems, reductions_per_wi, reduction_groups,
+                in_out_iter_indexer, reduction_indexer, {res_init_ev});
+
+        return comp_ev;
+    }
+}
+
+/* @brief Reduce rows in a matrix */
+template <typename argTy, typename resTy, typename ReductionOpT>
+sycl::event reduction_axis0_over_group_with_atomics_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems,      // number of reductions    (num. of cols in a
+                                  // matrix when reducing over cols)
+    std::size_t reduction_nelems, // size of each reduction  (length of cols,
+                                  // i.e. number of rows)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
+                                                          NoOpIndexerT{}};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        sycl::event comp_ev =
+            sequential_reduction<argTy, resTy, ReductionOpT,
+                                 InputOutputIterIndexerT, ReductionIndexerT,
+                                 reduction_seq_krn>(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+    else {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = ColsIndexerT;
+
+        static constexpr NoOpIndexerT columns_indexer{};
+        static constexpr NoOpIndexerT result_indexer{};
+        const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
+                                                          result_indexer};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        static constexpr std::size_t preferred_reductions_per_wi = 8;
+        std::size_t reductions_per_wi =
+            (reduction_nelems < preferred_reductions_per_wi * wg)
+                ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
+                : preferred_reductions_per_wi;
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+
+        sycl::event comp_ev =
+            submit_atomic_reduction<argTy, resTy, ReductionOpT,
+                                    InputOutputIterIndexerT, ReductionIndexerT,
+                                    reduction_over_group_with_atomics_krn>(
+                exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+                reduction_nelems, reductions_per_wi, reduction_groups,
+                in_out_iter_indexer, reduction_indexer, {res_init_ev});
+
+        return comp_ev;
+    }
+}
+
+/* = Reduction, using sycl::reduce_over_group, but not using atomic_ref = */
+
+template <
+    typename argTy,
+    typename resTy,
+    typename ReductionOpT,
+    typename InputOutputIterIndexerT,
+    typename ReductionIndexerT,
+    template <typename T1, typename T2, typename T3, typename T4, typename T5>
+    class kernel_name_token>
+sycl::event submit_no_atomic_reduction(
+    sycl::queue &exec_q,
+    const argTy *arg,
+    resTy *res,
+    resTy identity_val,
+    std::size_t wg,
+    std::size_t iter_nelems,
+    std::size_t reduction_nelems,
+    std::size_t reductions_per_wi,
+    std::size_t reduction_groups,
+    const InputOutputIterIndexerT &in_out_iter_indexer,
+    const ReductionIndexerT &reduction_indexer,
+    const std::vector<sycl::event> &depends)
+{
+    sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg};
+        auto localRange = sycl::range<1>{wg};
+        auto ndRange = sycl::nd_range<1>(globalRange, localRange);
+
+        if constexpr (can_use_reduce_over_group<ReductionOpT, resTy>::value) {
+            using KernelName =
+                class kernel_name_token<argTy, resTy, ReductionOpT,
+                                        InputOutputIterIndexerT,
+                                        ReductionIndexerT>;
+
+            cgh.parallel_for<KernelName>(
+                ndRange,
+                ReductionOverGroupNoAtomicFunctor<argTy, resTy, ReductionOpT,
+                                                  InputOutputIterIndexerT,
+                                                  ReductionIndexerT>(
+                    arg, res, ReductionOpT(), identity_val, in_out_iter_indexer,
+                    reduction_indexer, reduction_nelems, iter_nelems,
+                    reductions_per_wi));
+        }
+        else {
+            using SlmT = sycl::local_accessor<resTy, 1>;
+            SlmT local_memory = SlmT(localRange, cgh);
+            using KernelName = class custom_reduction_wrapper<
+                kernel_name_token<argTy, resTy, ReductionOpT,
+                                  InputOutputIterIndexerT, ReductionIndexerT>>;
+
+            cgh.parallel_for<KernelName>(
+                ndRange,
+                CustomReductionOverGroupNoAtomicFunctor<
+                    argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                    ReductionIndexerT, SlmT>(
+                    arg, res, ReductionOpT(), identity_val, in_out_iter_indexer,
+                    reduction_indexer, local_memory, reduction_nelems,
+                    iter_nelems, reductions_per_wi));
+        }
+    });
+    return red_ev;
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class reduction_over_group_temps_krn;
+
+typedef sycl::event (*reduction_strided_impl_fn_ptr)(
+    sycl::queue &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    char *,
+    int,
+    const ssize_t *,
+    ssize_t,
+    ssize_t,
+    int,
+    const ssize_t *,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T1, typename T2, typename T3>
+class reduction_over_group_temps_empty_krn;
+
+template <typename argTy, typename resTy, typename ReductionOpT>
+sycl::event reduction_over_group_temps_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
+                                  // matrix when reducing over rows)
+    std::size_t reduction_nelems, // size of each reduction  (length of rows,
+                                  // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    int iter_nd,
+    const ssize_t *iter_shape_and_strides,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    int red_nd,
+    const ssize_t *reduction_shape_stride,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const ssize_t *const &res_shape = iter_shape_and_strides;
+            const ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                       res_strides);
+            using InitKernelName =
+                class reduction_over_group_temps_empty_krn<resTy, argTy,
+                                                           ReductionOpT>;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for<InitKernelName>(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = identity_val;
+                });
+        });
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        sycl::event comp_ev =
+            sequential_reduction<argTy, resTy, ReductionOpT,
+                                 InputOutputIterIndexerT, ReductionIndexerT,
+                                 reduction_seq_krn>(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one 1 work-group per iteration,
+        // can output directly to res
+
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev = submit_no_atomic_reduction<
+            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+            reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-groups is needed, requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        const std::size_t tmp_alloc_size =
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
+        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+            tmp_alloc_size, exec_q);
+
+        resTy *partially_reduced_tmp = tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * iter_nelems;
+        ;
+
+        sycl::event first_reduction_ev;
+        {
+            using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::StridedIndexer;
+
+            // Only 2*iter_nd entries describing shape and strides of
+            // iterated dimensions of input array from
+            // iter_shape_and_strides are going to be accessed by
+            // inp_indexer
+            const InputIndexerT inp_indexer(iter_nd, iter_arg_offset,
+                                            iter_shape_and_strides);
+            static constexpr ResIndexerT noop_tmp_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              noop_tmp_indexer};
+            const ReductionIndexerT reduction_indexer{
+                red_nd, reduction_arg_offset, reduction_shape_stride};
+
+            first_reduction_ev = submit_no_atomic_reduction<
+                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT, reduction_over_group_temps_krn>(
+                exec_q, arg_tp, partially_reduced_tmp, identity_val, wg,
+                iter_nelems, reduction_nelems, preferred_reductions_per_wi,
+                reduction_groups, in_out_iter_indexer, reduction_indexer,
+                depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
+                 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            // keep reducing
+            sycl::event partial_reduction_ev;
+            {
+                using InputIndexerT =
+                    dpctl::tensor::offset_utils::Strided1DIndexer;
+                using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+                using InputOutputIterIndexerT =
+                    dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                        InputIndexerT, ResIndexerT>;
+                using ReductionIndexerT =
+                    dpctl::tensor::offset_utils::NoOpIndexer;
+
+                const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                                /* step */ reduction_groups_};
+                static constexpr ResIndexerT res_iter_indexer{};
+
+                const InputOutputIterIndexerT in_out_iter_indexer{
+                    inp_indexer, res_iter_indexer};
+                static constexpr ReductionIndexerT reduction_indexer{};
+
+                partial_reduction_ev = submit_no_atomic_reduction<
+                    resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                    ReductionIndexerT, reduction_over_group_temps_krn>(
+                    exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems,
+                    remaining_reduction_nelems, preferred_reductions_per_wi,
+                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
+                    {dependent_ev});
+            }
+
+            remaining_reduction_nelems = reduction_groups_;
+            std::swap(temp_arg, temp2_arg);
+            dependent_ev = std::move(partial_reduction_ev);
+        }
+
+        // final reduction to res
+        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                        /* step */ remaining_reduction_nelems};
+        const ResIndexerT res_iter_indexer{
+            iter_nd, iter_res_offset,
+            /* shape */ iter_shape_and_strides,
+            /* strides */ iter_shape_and_strides + 2 * iter_nd};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        wg = max_wg;
+        reductions_per_wi = std::max<std::size_t>(
+            1, (remaining_reduction_nelems + wg - 1) / wg);
+
+        reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev = submit_no_atomic_reduction<
+            resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems,
+            remaining_reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, {dependent_ev});
+
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {final_reduction_ev}, tmp_owner);
+
+        // FIXME: do not return host-task event
+        //   Instead collect all host-tasks to a list
+
+        return cleanup_host_task_event;
+    }
+}
+
+template <typename argTy, typename resTy, typename ReductionOpT>
+sycl::event reduction_axis1_over_group_temps_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
+                                  // matrix when reducing over rows)
+    std::size_t reduction_nelems, // size of each reduction  (length of rows,
+                                  // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIterIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            InputIterIndexerT{/* size */ iter_nelems,
+                              /* step */ reduction_nelems},
+            NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        sycl::event comp_ev =
+            sequential_reduction<argTy, resTy, ReductionOpT,
+                                 InputOutputIterIndexerT, ReductionIndexerT,
+                                 reduction_seq_krn>(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one 1 work-group per iteration,
+        // can output directly to res
+
+        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIterIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            InputIterIndexerT{/* size */ iter_nelems,
+                              /* step */ reduction_nelems},
+            NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev = submit_no_atomic_reduction<
+            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+            reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-groups is needed, requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        const std::size_t tmp_alloc_size =
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
+        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+            tmp_alloc_size, exec_q);
+        resTy *partially_reduced_tmp = tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * iter_nelems;
+
+        sycl::event first_reduction_ev;
+        {
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    RowsIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = NoOpIndexerT;
+
+            const RowsIndexerT rows_indexer{/* size */ iter_nelems,
+                                            /* step */ reduction_nelems};
+            static constexpr NoOpIndexerT noop_tmp_indexer{};
+            const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer,
+                                                              noop_tmp_indexer};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            first_reduction_ev = submit_no_atomic_reduction<
+                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT, reduction_over_group_temps_krn>(
+                exec_q, arg_tp, partially_reduced_tmp, identity_val, wg,
+                iter_nelems, reduction_nelems, preferred_reductions_per_wi,
+                reduction_groups, in_out_iter_indexer, reduction_indexer,
+                depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
+                 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            // keep reducing
+            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+            const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                            /* step */ reduction_groups_};
+            static constexpr ResIndexerT res_iter_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              res_iter_indexer};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            sycl::event partial_reduction_ev = submit_no_atomic_reduction<
+                resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT, reduction_over_group_temps_krn>(
+                exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems,
+                remaining_reduction_nelems, preferred_reductions_per_wi,
+                reduction_groups_, in_out_iter_indexer, reduction_indexer,
+                {dependent_ev});
+
+            remaining_reduction_nelems = reduction_groups_;
+            std::swap(temp_arg, temp2_arg);
+            dependent_ev = std::move(partial_reduction_ev);
+        }
+
+        // final reduction to res
+        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                        /* step */ remaining_reduction_nelems};
+        static constexpr ResIndexerT res_iter_indexer{};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        wg = max_wg;
+        reductions_per_wi = std::max<std::size_t>(
+            1, (remaining_reduction_nelems + wg - 1) / wg);
+
+        reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev = submit_no_atomic_reduction<
+            resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems,
+            remaining_reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, {dependent_ev});
+
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {final_reduction_ev}, tmp_owner);
+
+        // FIXME: do not return host-task event
+        //   Instead collect all host-tasks to a list
+
+        return cleanup_host_task_event;
+    }
+}
+
+template <typename argTy, typename resTy, typename ReductionOpT>
+sycl::event reduction_axis0_over_group_temps_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
+                                  // matrix when reducing over rows)
+    std::size_t reduction_nelems, // size of each reduction  (length of rows,
+                                  // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
+                                                          NoOpIndexerT{}};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        sycl::event comp_ev =
+            sequential_reduction<argTy, resTy, ReductionOpT,
+                                 InputOutputIterIndexerT, ReductionIndexerT,
+                                 reduction_seq_krn>(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one 1 work-group per iteration,
+        // can output directly to res
+
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = ColsIndexerT;
+
+        static constexpr NoOpIndexerT columns_indexer{};
+        static constexpr NoOpIndexerT result_indexer{};
+        const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
+                                                          result_indexer};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev = submit_no_atomic_reduction<
+            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+            reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-groups is needed, requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        const std::size_t tmp_alloc_size =
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
+
+        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+            tmp_alloc_size, exec_q);
+
+        resTy *partially_reduced_tmp = tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * iter_nelems;
+
+        sycl::event first_reduction_ev;
+        {
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    NoOpIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = ColsIndexerT;
+
+            static constexpr NoOpIndexerT columns_indexer{};
+            static constexpr NoOpIndexerT noop_tmp_indexer{};
+            const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
+                                                              noop_tmp_indexer};
+            const ReductionIndexerT reduction_indexer{
+                /* size */ reduction_nelems,
+                /* step */ iter_nelems};
+
+            first_reduction_ev = submit_no_atomic_reduction<
+                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT, reduction_over_group_temps_krn>(
+                exec_q, arg_tp, partially_reduced_tmp, identity_val, wg,
+                iter_nelems, reduction_nelems, preferred_reductions_per_wi,
+                reduction_groups, in_out_iter_indexer, reduction_indexer,
+                depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
+                 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            // keep reducing
+            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+            const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                            /* step */ reduction_groups_};
+            static constexpr ResIndexerT res_iter_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              res_iter_indexer};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            sycl::event partial_reduction_ev = submit_no_atomic_reduction<
+                resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT, reduction_over_group_temps_krn>(
+                exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems,
+                remaining_reduction_nelems, preferred_reductions_per_wi,
+                reduction_groups_, in_out_iter_indexer, reduction_indexer,
+                {dependent_ev});
+
+            remaining_reduction_nelems = reduction_groups_;
+            std::swap(temp_arg, temp2_arg);
+            dependent_ev = std::move(partial_reduction_ev);
+        }
+
+        // final reduction to res
+        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                        /* step */ remaining_reduction_nelems};
+        static constexpr ResIndexerT res_iter_indexer{};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        wg = max_wg;
+        reductions_per_wi = std::max<std::size_t>(
+            1, (remaining_reduction_nelems + wg - 1) / wg);
+
+        reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev = submit_no_atomic_reduction<
+            resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems,
+            remaining_reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, {dependent_ev});
+
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {final_reduction_ev}, tmp_owner);
+
+        // FIXME: do not return host-task event
+        //   Instead collect all host-tasks to a list
+
+        return cleanup_host_task_event;
+    }
+}
+
+// Argmax and Argmin
+
+/* Sequential search reduction */
+
+template <typename argT,
+          typename outT,
+          typename ReductionOp,
+          typename IdxReductionOp,
+          typename InputOutputIterIndexerT,
+          typename InputRedIndexerT>
+struct SequentialSearchReduction
+{
+private:
+    const argT *inp_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOp reduction_op_;
+    argT identity_;
+    IdxReductionOp idx_reduction_op_;
+    outT idx_identity_;
+    InputOutputIterIndexerT inp_out_iter_indexer_;
+    InputRedIndexerT inp_reduced_dims_indexer_;
+    std::size_t reduction_max_gid_ = 0;
+
+public:
+    SequentialSearchReduction(
+        const argT *inp,
+        outT *res,
+        const ReductionOp &reduction_op,
+        const argT &identity_val,
+        const IdxReductionOp &idx_reduction_op,
+        const outT &idx_identity_val,
+        const InputOutputIterIndexerT &arg_res_iter_indexer,
+        const InputRedIndexerT &arg_reduced_dims_indexer,
+        std::size_t reduction_size)
+        : inp_(inp), out_(res), reduction_op_(reduction_op),
+          identity_(identity_val), idx_reduction_op_(idx_reduction_op),
+          idx_identity_(idx_identity_val),
+          inp_out_iter_indexer_(arg_res_iter_indexer),
+          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
+          reduction_max_gid_(reduction_size)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+
+        auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]);
+        const ssize_t &inp_iter_offset =
+            inp_out_iter_offsets_.get_first_offset();
+        const ssize_t &out_iter_offset =
+            inp_out_iter_offsets_.get_second_offset();
+
+        argT red_val(identity_);
+        outT idx_val(idx_identity_);
+        for (std::size_t m = 0; m < reduction_max_gid_; ++m) {
+            const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m);
+            const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
+
+            argT val = inp_[inp_offset];
+            if (val == red_val) {
+                idx_val = idx_reduction_op_(idx_val, static_cast<outT>(m));
+            }
+            else {
+                if constexpr (su_ns::IsMinimum<argT, ReductionOp>::value) {
+                    using dpctl::tensor::type_utils::is_complex;
+                    if constexpr (is_complex<argT>::value) {
+                        using dpctl::tensor::math_utils::less_complex;
+                        // less_complex always returns false for NaNs, so check
+                        if (less_complex<argT>(val, red_val) ||
+                            std::isnan(std::real(val)) ||
+                            std::isnan(std::imag(val)))
+                        {
+                            red_val = val;
+                            idx_val = static_cast<outT>(m);
+                        }
+                    }
+                    else if constexpr (std::is_floating_point_v<argT> ||
+                                       std::is_same_v<argT, sycl::half>)
+                    {
+                        if (val < red_val || std::isnan(val)) {
+                            red_val = val;
+                            idx_val = static_cast<outT>(m);
+                        }
+                    }
+                    else {
+                        if (val < red_val) {
+                            red_val = val;
+                            idx_val = static_cast<outT>(m);
+                        }
+                    }
+                }
+                else if constexpr (su_ns::IsMaximum<argT, ReductionOp>::value) {
+                    using dpctl::tensor::type_utils::is_complex;
+                    if constexpr (is_complex<argT>::value) {
+                        using dpctl::tensor::math_utils::greater_complex;
+                        if (greater_complex<argT>(val, red_val) ||
+                            std::isnan(std::real(val)) ||
+                            std::isnan(std::imag(val)))
+                        {
+                            red_val = val;
+                            idx_val = static_cast<outT>(m);
+                        }
+                    }
+                    else if constexpr (std::is_floating_point_v<argT> ||
+                                       std::is_same_v<argT, sycl::half>)
+                    {
+                        if (val > red_val || std::isnan(val)) {
+                            red_val = val;
+                            idx_val = static_cast<outT>(m);
+                        }
+                    }
+                    else {
+                        if (val > red_val) {
+                            red_val = val;
+                            idx_val = static_cast<outT>(m);
+                        }
+                    }
+                }
+            }
+        }
+        out_[out_iter_offset] = idx_val;
+    }
+};
+
+/* = Search reduction using reduce_over_group*/
+
+template <typename argT,
+          typename outT,
+          typename ReductionOp,
+          typename IdxReductionOp,
+          typename InputOutputIterIndexerT,
+          typename InputRedIndexerT,
+          bool First,
+          bool Last>
+struct SearchReduction
+{
+private:
+    const argT *inp_ = nullptr;
+    argT *vals_ = nullptr;
+    const outT *inds_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOp reduction_op_;
+    argT identity_;
+    IdxReductionOp idx_reduction_op_;
+    outT idx_identity_;
+    InputOutputIterIndexerT inp_out_iter_indexer_;
+    InputRedIndexerT inp_reduced_dims_indexer_;
+    std::size_t reduction_max_gid_ = 0;
+    std::size_t iter_gws_ = 1;
+    std::size_t reductions_per_wi = 16;
+
+public:
+    SearchReduction(const argT *data,
+                    argT *vals,
+                    const outT *inds,
+                    outT *res,
+                    const ReductionOp &reduction_op,
+                    const argT &identity_val,
+                    const IdxReductionOp &idx_reduction_op,
+                    const outT &idx_identity_val,
+                    const InputOutputIterIndexerT &arg_res_iter_indexer,
+                    const InputRedIndexerT &arg_reduced_dims_indexer,
+                    std::size_t reduction_size,
+                    std::size_t iteration_size,
+                    std::size_t reduction_size_per_wi)
+        : inp_(data), vals_(vals), inds_(inds), out_(res),
+          reduction_op_(reduction_op), identity_(identity_val),
+          idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val),
+          inp_out_iter_indexer_(arg_res_iter_indexer),
+          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
+          reduction_max_gid_(reduction_size), iter_gws_(iteration_size),
+          reductions_per_wi(reduction_size_per_wi)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); //   0 <= reduction_lid < wg
+
+        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
+        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+        const std::size_t n_reduction_groups =
+            it.get_group_range(0) / iter_gws_;
+
+        // work-items operates over input with indices
+        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //   + reduction_lid
+        // for 0 <= m < reductions_per_wi
+
+        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
+        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
+        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
+
+        argT local_red_val(identity_);
+        outT local_idx(idx_identity_);
+        std::size_t arg_reduce_gid0 =
+            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
+        for (std::size_t m = 0; m < reductions_per_wi; ++m) {
+            std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg;
+
+            if (arg_reduce_gid < reduction_max_gid_) {
+                auto inp_reduction_offset =
+                    inp_reduced_dims_indexer_(arg_reduce_gid);
+                auto inp_offset = inp_iter_offset + inp_reduction_offset;
+
+                argT val = inp_[inp_offset];
+                if (val == local_red_val) {
+                    if constexpr (!First) {
+                        local_idx =
+                            idx_reduction_op_(local_idx, inds_[inp_offset]);
+                    }
+                    else {
+                        local_idx = idx_reduction_op_(
+                            local_idx, static_cast<outT>(arg_reduce_gid));
+                    }
+                }
+                else {
+                    if constexpr (su_ns::IsMinimum<argT, ReductionOp>::value) {
+                        if (val < local_red_val) {
+                            local_red_val = val;
+                            if constexpr (!First) {
+                                local_idx = inds_[inp_offset];
+                            }
+                            else {
+                                local_idx = static_cast<outT>(arg_reduce_gid);
+                            }
+                        }
+                    }
+                    else if constexpr (su_ns::IsMaximum<argT,
+                                                        ReductionOp>::value) {
+                        if (val > local_red_val) {
+                            local_red_val = val;
+                            if constexpr (!First) {
+                                local_idx = inds_[inp_offset];
+                            }
+                            else {
+                                local_idx = static_cast<outT>(arg_reduce_gid);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        auto work_group = it.get_group();
+        // This only works if reduction_op_ is from small set of operators
+        argT red_val_over_wg = sycl::reduce_over_group(
+            work_group, local_red_val, identity_, reduction_op_);
+
+        if constexpr (std::is_integral_v<argT>) {
+            local_idx =
+                (red_val_over_wg == local_red_val) ? local_idx : idx_identity_;
+        }
+        else {
+            local_idx =
+                (red_val_over_wg == local_red_val ||
+                 std::isnan(red_val_over_wg) || std::isnan(local_red_val))
+                    ? local_idx
+                    : idx_identity_;
+        }
+        outT idx_over_wg = sycl::reduce_over_group(
+            work_group, local_idx, idx_identity_, idx_reduction_op_);
+
+        if (work_group.leader()) {
+            // each group writes to a different memory location
+            if constexpr (!Last) {
+                // if not the final reduction, write value corresponding to
+                // an index to a temporary
+                vals_[out_iter_offset * n_reduction_groups +
+                      reduction_batch_id] = red_val_over_wg;
+            }
+            out_[out_iter_offset * n_reduction_groups + reduction_batch_id] =
+                idx_over_wg;
+        }
+    }
+};
+
+/* = Search reduction using custom_reduce_over_group*/
+
+template <typename argT,
+          typename outT,
+          typename ReductionOp,
+          typename IdxReductionOp,
+          typename InputOutputIterIndexerT,
+          typename InputRedIndexerT,
+          typename SlmT,
+          bool First,
+          bool Last>
+struct CustomSearchReduction
+{
+private:
+    const argT *inp_ = nullptr;
+    argT *vals_ = nullptr;
+    const outT *inds_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOp reduction_op_;
+    argT identity_;
+    IdxReductionOp idx_reduction_op_;
+    outT idx_identity_;
+    InputOutputIterIndexerT inp_out_iter_indexer_;
+    InputRedIndexerT inp_reduced_dims_indexer_;
+    SlmT local_mem_;
+    std::size_t reduction_max_gid_ = 0;
+    std::size_t iter_gws_ = 1;
+    std::size_t reductions_per_wi = 16;
+
+public:
+    CustomSearchReduction(const argT *data,
+                          argT *vals,
+                          outT *inds,
+                          outT *res,
+                          const ReductionOp &reduction_op,
+                          const argT &identity_val,
+                          const IdxReductionOp &idx_reduction_op,
+                          const outT &idx_identity_val,
+                          const InputOutputIterIndexerT &arg_res_iter_indexer,
+                          const InputRedIndexerT &arg_reduced_dims_indexer,
+                          SlmT local_mem,
+                          std::size_t reduction_size,
+                          std::size_t iteration_size,
+                          std::size_t reduction_size_per_wi)
+        : inp_(data), vals_(vals), inds_(inds), out_(res),
+          reduction_op_(reduction_op), identity_(identity_val),
+          idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val),
+          inp_out_iter_indexer_(arg_res_iter_indexer),
+          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
+          local_mem_(local_mem), reduction_max_gid_(reduction_size),
+          iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); //   0 <= reduction_lid < wg
+
+        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
+        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+        const std::size_t n_reduction_groups =
+            it.get_group_range(0) / iter_gws_;
+
+        // work-items operates over input with indices
+        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //   + reduction_lid
+        // for 0 <= m < reductions_per_wi
+
+        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
+        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
+        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
+
+        argT local_red_val(identity_);
+        outT local_idx(idx_identity_);
+        std::size_t arg_reduce_gid0 =
+            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
+        for (std::size_t m = 0; m < reductions_per_wi; ++m) {
+            std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg;
+
+            if (arg_reduce_gid < reduction_max_gid_) {
+                auto inp_reduction_offset =
+                    inp_reduced_dims_indexer_(arg_reduce_gid);
+                auto inp_offset = inp_iter_offset + inp_reduction_offset;
+
+                argT val = inp_[inp_offset];
+                if (val == local_red_val) {
+                    if constexpr (!First) {
+                        local_idx =
+                            idx_reduction_op_(local_idx, inds_[inp_offset]);
+                    }
+                    else {
+                        local_idx = idx_reduction_op_(
+                            local_idx, static_cast<outT>(arg_reduce_gid));
+                    }
+                }
+                else {
+                    if constexpr (su_ns::IsMinimum<argT, ReductionOp>::value) {
+                        using dpctl::tensor::type_utils::is_complex;
+                        if constexpr (is_complex<argT>::value) {
+                            using dpctl::tensor::math_utils::less_complex;
+                            // less_complex always returns false for NaNs, so
+                            // check
+                            if (less_complex<argT>(val, local_red_val) ||
+                                std::isnan(std::real(val)) ||
+                                std::isnan(std::imag(val)))
+                            {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else if constexpr (std::is_floating_point_v<argT> ||
+                                           std::is_same_v<argT, sycl::half>)
+                        {
+                            if (val < local_red_val || std::isnan(val)) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else {
+                            if (val < local_red_val) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                    }
+                    else if constexpr (su_ns::IsMaximum<argT,
+                                                        ReductionOp>::value) {
+                        using dpctl::tensor::type_utils::is_complex;
+                        if constexpr (is_complex<argT>::value) {
+                            using dpctl::tensor::math_utils::greater_complex;
+                            if (greater_complex<argT>(val, local_red_val) ||
+                                std::isnan(std::real(val)) ||
+                                std::isnan(std::imag(val)))
+                            {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else if constexpr (std::is_floating_point_v<argT> ||
+                                           std::is_same_v<argT, sycl::half>)
+                        {
+                            if (val > local_red_val || std::isnan(val)) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else {
+                            if (val > local_red_val) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        auto work_group = it.get_group();
+        // This only works if reduction_op_ is from small set of operators
+        argT red_val_over_wg = su_ns::custom_reduce_over_group(
+            work_group, local_mem_, local_red_val, reduction_op_);
+
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<argT>::value) {
+            // equality does not hold for NaNs, so check here
+            local_idx = (red_val_over_wg == local_red_val ||
+                         std::isnan(std::real(local_red_val)) ||
+                         std::isnan(std::imag(local_red_val)))
+                            ? local_idx
+                            : idx_identity_;
+        }
+        else if constexpr (std::is_floating_point_v<argT> ||
+                           std::is_same_v<argT, sycl::half>)
+        {
+            // equality does not hold for NaNs, so check here
+            local_idx =
+                (red_val_over_wg == local_red_val || std::isnan(local_red_val))
+                    ? local_idx
+                    : idx_identity_;
+        }
+        else {
+            local_idx =
+                red_val_over_wg == local_red_val ? local_idx : idx_identity_;
+        }
+        outT idx_over_wg = sycl::reduce_over_group(
+            work_group, local_idx, idx_identity_, idx_reduction_op_);
+        if (work_group.leader()) {
+            // each group writes to a different memory location
+            if constexpr (!Last) {
+                // if not the final reduction, write value corresponding to
+                // an index to a temporary
+                vals_[out_iter_offset * n_reduction_groups +
+                      reduction_batch_id] = red_val_over_wg;
+            }
+            out_[out_iter_offset * n_reduction_groups + reduction_batch_id] =
+                idx_over_wg;
+        }
+    }
+};
+
+typedef sycl::event (*search_strided_impl_fn_ptr)(
+    sycl::queue &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    char *,
+    int,
+    const ssize_t *,
+    ssize_t,
+    ssize_t,
+    int,
+    const ssize_t *,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          typename T6>
+class search_seq_strided_krn;
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          typename T6>
+class search_seq_contig_krn;
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          typename T6,
+          bool b1,
+          bool b2>
+class search_over_group_krn;
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          typename T6,
+          typename T7,
+          bool b1,
+          bool b2>
+class custom_search_over_group_krn;
+
+template <typename T1, typename T2, typename T3>
+class search_empty_krn;
+
+template <typename argTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename IndexOpT,
+          typename InputOutputIterIndexerT,
+          typename ReductionIndexerT,
+          bool First,
+          bool Last>
+sycl::event
+    submit_search_reduction(sycl::queue &exec_q,
+                            const argTy *arg,
+                            argTy *arg_tmp,
+                            resTy *res_tmp,
+                            resTy *res,
+                            argTy identity_val,
+                            resTy idx_identity_val,
+                            std::size_t wg,
+                            std::size_t iter_nelems,
+                            std::size_t reduction_nelems,
+                            std::size_t reductions_per_wi,
+                            std::size_t reduction_groups,
+                            const InputOutputIterIndexerT &in_out_iter_indexer,
+                            const ReductionIndexerT &reduction_indexer,
+                            const std::vector<sycl::event> &depends)
+{
+    sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg};
+        auto localRange = sycl::range<1>{wg};
+        auto ndRange = sycl::nd_range<1>(globalRange, localRange);
+
+        if constexpr (can_use_reduce_over_group<ReductionOpT, resTy>::value) {
+            using KernelName =
+                class search_over_group_krn<argTy, resTy, ReductionOpT,
+                                            IndexOpT, InputOutputIterIndexerT,
+                                            ReductionIndexerT, First, Last>;
+            cgh.parallel_for<KernelName>(
+                ndRange, SearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                         InputOutputIterIndexerT,
+                                         ReductionIndexerT, First, Last>(
+                             arg, arg_tmp, res_tmp, res, ReductionOpT(),
+                             identity_val, IndexOpT(), idx_identity_val,
+                             in_out_iter_indexer, reduction_indexer,
+                             reduction_nelems, iter_nelems, reductions_per_wi));
+        }
+        else {
+            using SlmT = sycl::local_accessor<argTy, 1>;
+            SlmT local_memory = SlmT(localRange, cgh);
+            using KernelName = class custom_search_over_group_krn<
+                argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT,
+                ReductionIndexerT, SlmT, First, Last>;
+            cgh.parallel_for<KernelName>(
+                ndRange,
+                CustomSearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                      InputOutputIterIndexerT,
+                                      ReductionIndexerT, SlmT, First, Last>(
+                    arg, arg_tmp, res_tmp, res, ReductionOpT(), identity_val,
+                    IndexOpT(), idx_identity_val, in_out_iter_indexer,
+                    reduction_indexer, local_memory, reduction_nelems,
+                    iter_nelems, reductions_per_wi));
+        }
+    });
+    return red_ev;
+}
+
+template <typename argTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename IndexOpT>
+sycl::event search_over_group_temps_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
+                                  // matrix when reducing over rows)
+    std::size_t reduction_nelems, // size of each reduction  (length of rows,
+                                  // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    int iter_nd,
+    const ssize_t *iter_shape_and_strides,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    int red_nd,
+    const ssize_t *reduction_shape_stride,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    static constexpr argTy identity_val =
+        su_ns::Identity<ReductionOpT, argTy>::value;
+    static constexpr resTy idx_identity_val =
+        su_ns::Identity<IndexOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const ssize_t *const &res_shape = iter_shape_and_strides;
+            const ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                       res_strides);
+            using InitKernelName =
+                class search_empty_krn<resTy, argTy, ReductionOpT>;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for<InitKernelName>(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = idx_identity_val;
+                });
+        });
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            cgh.parallel_for<class search_seq_strided_krn<
+                argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT,
+                ReductionIndexerT>>(
+                sycl::range<1>(iter_nelems),
+                SequentialSearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                          InputOutputIterIndexerT,
+                                          ReductionIndexerT>(
+                    arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(),
+                    idx_identity_val, in_out_iter_indexer, reduction_indexer,
+                    reduction_nelems));
+        });
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 4;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one 1 work-group per iteration,
+        // can output directly to res
+
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev =
+            submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                    InputOutputIterIndexerT, ReductionIndexerT,
+                                    true, true>(
+                exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val,
+                idx_identity_val, wg, iter_nelems, reduction_nelems,
+                reductions_per_wi, reduction_groups, in_out_iter_indexer,
+                reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-groups is needed, requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        const std::size_t tmp_alloc_size =
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
+        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+            tmp_alloc_size, exec_q);
+
+        resTy *partially_reduced_tmp = tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * iter_nelems;
+
+        auto val_tmp_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<argTy>(
+                tmp_alloc_size, exec_q);
+
+        argTy *partially_reduced_vals_tmp = val_tmp_owner.get();
+        argTy *partially_reduced_vals_tmp2 =
+            partially_reduced_vals_tmp + reduction_groups * iter_nelems;
+
+        sycl::event first_reduction_ev;
+        {
+            using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::StridedIndexer;
+
+            // Only 2*iter_nd entries describing shape and strides of iterated
+            // dimensions of input array from iter_shape_and_strides are going
+            // to be accessed by inp_indexer
+            const InputIndexerT inp_indexer(iter_nd, iter_arg_offset,
+                                            iter_shape_and_strides);
+            static constexpr ResIndexerT noop_tmp_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              noop_tmp_indexer};
+            const ReductionIndexerT reduction_indexer{
+                red_nd, reduction_arg_offset, reduction_shape_stride};
+
+            first_reduction_ev =
+                submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                        InputOutputIterIndexerT,
+                                        ReductionIndexerT, true, false>(
+                    exec_q, arg_tp, partially_reduced_vals_tmp, nullptr,
+                    partially_reduced_tmp, identity_val, idx_identity_val, wg,
+                    iter_nelems, reduction_nelems, reductions_per_wi,
+                    reduction_groups, in_out_iter_indexer, reduction_indexer,
+                    depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+
+        argTy *vals_temp_arg = partially_reduced_vals_tmp;
+        argTy *vals_temp2_arg = partially_reduced_vals_tmp2;
+
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
+                 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            // keep reducing
+            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+            const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                            /* step */ reduction_groups_};
+            static constexpr ResIndexerT res_iter_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              res_iter_indexer};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            sycl::event partial_reduction_ev =
+                submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                        InputOutputIterIndexerT,
+                                        ReductionIndexerT, false, false>(
+                    exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg,
+                    identity_val, idx_identity_val, wg, iter_nelems,
+                    remaining_reduction_nelems, preferred_reductions_per_wi,
+                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
+                    {dependent_ev});
+
+            remaining_reduction_nelems = reduction_groups_;
+            std::swap(temp_arg, temp2_arg);
+            std::swap(vals_temp_arg, vals_temp2_arg);
+            dependent_ev = partial_reduction_ev;
+        }
+
+        // final reduction to res
+        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                        /* step */ remaining_reduction_nelems};
+        const ResIndexerT res_iter_indexer{
+            iter_nd, iter_res_offset,
+            /* shape */ iter_shape_and_strides,
+            /* strides */ iter_shape_and_strides + 2 * iter_nd};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        wg = max_wg;
+        reductions_per_wi = std::max<std::size_t>(
+            1, (remaining_reduction_nelems + wg - 1) / wg);
+
+        reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev =
+            submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                    InputOutputIterIndexerT, ReductionIndexerT,
+                                    false, true>(
+                exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val,
+                idx_identity_val, wg, iter_nelems, remaining_reduction_nelems,
+                reductions_per_wi, reduction_groups, in_out_iter_indexer,
+                reduction_indexer, {dependent_ev});
+
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner);
+
+        // FIXME: do not return host-task event
+        //   Instead collect all host-tasks to a list
+
+        return cleanup_host_task_event;
+    }
+}
+
+typedef sycl::event (*search_contig_impl_fn_ptr)(
+    sycl::queue &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    char *,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename argTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename IndexOpT>
+sycl::event search_axis1_over_group_temps_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
+                                  // matrix when reducing over rows)
+    std::size_t reduction_nelems, // size of each reduction  (length of rows,
+                                  // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr argTy identity_val =
+        su_ns::Identity<ReductionOpT, argTy>::value;
+    static constexpr resTy idx_identity_val =
+        su_ns::Identity<IndexOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(idx_identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIterIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            InputIterIndexerT{/* size */ iter_nelems,
+                              /* step */ reduction_nelems},
+            NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            cgh.parallel_for<class search_seq_contig_krn<
+                argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT,
+                ReductionIndexerT>>(
+                sycl::range<1>(iter_nelems),
+                SequentialSearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                          InputOutputIterIndexerT,
+                                          ReductionIndexerT>(
+                    arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(),
+                    idx_identity_val, in_out_iter_indexer, reduction_indexer,
+                    reduction_nelems));
+        });
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one 1 work-group per iteration,
+        // can output directly to res
+        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIterIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            InputIterIndexerT{/* size */ iter_nelems,
+                              /* step */ reduction_nelems},
+            NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev =
+            submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                    InputOutputIterIndexerT, ReductionIndexerT,
+                                    true, true>(
+                exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val,
+                idx_identity_val, wg, iter_nelems, reduction_nelems,
+                reductions_per_wi, reduction_groups, in_out_iter_indexer,
+                reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-groups is needed, requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        const std::size_t tmp_alloc_size =
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
+        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+            tmp_alloc_size, exec_q);
+        resTy *partially_reduced_tmp = tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * iter_nelems;
+
+        auto val_tmp_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<argTy>(
+                tmp_alloc_size, exec_q);
+        argTy *partially_reduced_vals_tmp = val_tmp_owner.get();
+        argTy *partially_reduced_vals_tmp2 =
+            partially_reduced_vals_tmp + reduction_groups * iter_nelems;
+
+        sycl::event first_reduction_ev;
+        {
+            using InputIterIndexerT =
+                dpctl::tensor::offset_utils::Strided1DIndexer;
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIterIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = NoOpIndexerT;
+
+            const InputOutputIterIndexerT in_out_iter_indexer{
+                InputIterIndexerT{/* size */ iter_nelems,
+                                  /* step */ reduction_nelems},
+                NoOpIndexerT{}};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            first_reduction_ev =
+                submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                        InputOutputIterIndexerT,
+                                        ReductionIndexerT, true, false>(
+                    exec_q, arg_tp, partially_reduced_vals_tmp, nullptr,
+                    partially_reduced_tmp, identity_val, idx_identity_val, wg,
+                    iter_nelems, reduction_nelems, preferred_reductions_per_wi,
+                    reduction_groups, in_out_iter_indexer, reduction_indexer,
+                    depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+
+        argTy *vals_temp_arg = partially_reduced_vals_tmp;
+        argTy *vals_temp2_arg = partially_reduced_vals_tmp2;
+
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
+                 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            // keep reducing
+            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+            const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                            /* step */ reduction_groups_};
+            static constexpr ResIndexerT res_iter_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              res_iter_indexer};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            sycl::event partial_reduction_ev =
+                submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                        InputOutputIterIndexerT,
+                                        ReductionIndexerT, false, false>(
+                    exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg,
+                    identity_val, idx_identity_val, wg, iter_nelems,
+                    remaining_reduction_nelems, preferred_reductions_per_wi,
+                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
+                    {dependent_ev});
+
+            remaining_reduction_nelems = reduction_groups_;
+            std::swap(temp_arg, temp2_arg);
+            std::swap(vals_temp_arg, vals_temp2_arg);
+            dependent_ev = partial_reduction_ev;
+        }
+
+        // final reduction to res
+        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                        /* step */ remaining_reduction_nelems};
+        static constexpr ResIndexerT res_iter_indexer{};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        wg = max_wg;
+        reductions_per_wi = std::max<std::size_t>(
+            1, (remaining_reduction_nelems + wg - 1) / wg);
+
+        reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev =
+            submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                    InputOutputIterIndexerT, ReductionIndexerT,
+                                    false, true>(
+                exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val,
+                idx_identity_val, wg, iter_nelems, remaining_reduction_nelems,
+                reductions_per_wi, reduction_groups, in_out_iter_indexer,
+                reduction_indexer, {dependent_ev});
+
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner);
+
+        // FIXME: do not return host-task event
+        //   Instead collect all host-tasks to a list
+
+        return cleanup_host_task_event;
+    }
+}
+
+template <typename argTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename IndexOpT>
+sycl::event search_axis0_over_group_temps_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
+                                  // matrix when reducing over rows)
+    std::size_t reduction_nelems, // size of each reduction  (length of rows,
+                                  // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr argTy identity_val =
+        su_ns::Identity<ReductionOpT, argTy>::value;
+    static constexpr resTy idx_identity_val =
+        su_ns::Identity<IndexOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(idx_identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
+                                                          NoOpIndexerT{}};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            using KernelName =
+                class search_seq_contig_krn<argTy, resTy, ReductionOpT,
+                                            IndexOpT, InputOutputIterIndexerT,
+                                            ReductionIndexerT>;
+
+            sycl::range<1> iter_range{iter_nelems};
+
+            cgh.parallel_for<KernelName>(
+                iter_range,
+                SequentialSearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                          InputOutputIterIndexerT,
+                                          ReductionIndexerT>(
+                    arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(),
+                    idx_identity_val, in_out_iter_indexer, reduction_indexer,
+                    reduction_nelems));
+        });
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one 1 work-group per iteration,
+        // can output directly to res
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = ColsIndexerT;
+
+        static constexpr NoOpIndexerT columns_indexer{};
+        static constexpr NoOpIndexerT result_indexer{};
+        const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
+                                                          result_indexer};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev =
+            submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                    InputOutputIterIndexerT, ReductionIndexerT,
+                                    true, true>(
+                exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val,
+                idx_identity_val, wg, iter_nelems, reduction_nelems,
+                reductions_per_wi, reduction_groups, in_out_iter_indexer,
+                reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-groups is needed, requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        const std::size_t tmp_alloc_size =
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
+        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+            tmp_alloc_size, exec_q);
+
+        resTy *partially_reduced_tmp = tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * iter_nelems;
+
+        auto vals_tmp_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<argTy>(
+                tmp_alloc_size, exec_q);
+        argTy *partially_reduced_vals_tmp = vals_tmp_owner.get();
+        argTy *partially_reduced_vals_tmp2 =
+            partially_reduced_vals_tmp + reduction_groups * iter_nelems;
+
+        sycl::event first_reduction_ev;
+        {
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    NoOpIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = ColsIndexerT;
+
+            static constexpr NoOpIndexerT columns_indexer{};
+            static constexpr NoOpIndexerT result_indexer{};
+            const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
+                                                              result_indexer};
+            const ReductionIndexerT reduction_indexer{
+                /* size */ reduction_nelems,
+                /* step */ iter_nelems};
+
+            first_reduction_ev =
+                submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                        InputOutputIterIndexerT,
+                                        ReductionIndexerT, true, false>(
+                    exec_q, arg_tp, partially_reduced_vals_tmp, nullptr,
+                    partially_reduced_tmp, identity_val, idx_identity_val, wg,
+                    iter_nelems, reduction_nelems, preferred_reductions_per_wi,
+                    reduction_groups, in_out_iter_indexer, reduction_indexer,
+                    depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+
+        argTy *vals_temp_arg = partially_reduced_vals_tmp;
+        argTy *vals_temp2_arg = partially_reduced_vals_tmp2;
+
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
+                 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            // keep reducing
+            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+            const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                            /* step */ reduction_groups_};
+            static constexpr ResIndexerT res_iter_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              res_iter_indexer};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            sycl::event partial_reduction_ev =
+                submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                        InputOutputIterIndexerT,
+                                        ReductionIndexerT, false, false>(
+                    exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg,
+                    identity_val, idx_identity_val, wg, iter_nelems,
+                    remaining_reduction_nelems, preferred_reductions_per_wi,
+                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
+                    {dependent_ev});
+
+            remaining_reduction_nelems = reduction_groups_;
+            std::swap(temp_arg, temp2_arg);
+            std::swap(vals_temp_arg, vals_temp2_arg);
+            dependent_ev = partial_reduction_ev;
+        }
+
+        // final reduction to res
+        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                        /* step */ remaining_reduction_nelems};
+        static constexpr ResIndexerT res_iter_indexer{};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        wg = max_wg;
+        reductions_per_wi = std::max<std::size_t>(
+            1, (remaining_reduction_nelems + wg - 1) / wg);
+
+        reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev =
+            submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                    InputOutputIterIndexerT, ReductionIndexerT,
+                                    false, true>(
+                exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val,
+                idx_identity_val, wg, iter_nelems, remaining_reduction_nelems,
+                reductions_per_wi, reduction_groups, in_out_iter_indexer,
+                reduction_indexer, {dependent_ev});
+
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {final_reduction_ev}, tmp_owner, vals_tmp_owner);
+
+        // FIXME: do not return host-task event
+        //   Instead collect all host-tasks to a list
+
+        return cleanup_host_task_event;
+    }
+}
+
+} // namespace dpctl::tensor::kernels
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/all.cpp b/dpctl_ext/tensor/libtensor/source/reductions/all.cpp
new file mode 100644
index 000000000000..a901b9e1d9a3
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/all.cpp
@@ -0,0 +1,164 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    all_reduction_strided_dispatch_vector[td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    all_reduction_axis1_contig_dispatch_vector[td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    all_reduction_axis0_contig_dispatch_vector[td_ns::num_types];
+
+template <typename fnT, typename srcTy>
+struct AllStridedFactory
+{
+    fnT get() const
+    {
+        using dstTy = std::int32_t;
+        using ReductionOpT = sycl::logical_and<dstTy>;
+        return dpctl::tensor::kernels::
+            reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
+                                                           ReductionOpT>;
+    }
+};
+
+template <typename fnT, typename srcTy>
+struct AllAxis1ContigFactory
+{
+    fnT get() const
+    {
+        using dstTy = std::int32_t;
+        using ReductionOpT = sycl::logical_and<dstTy>;
+        return dpctl::tensor::kernels::
+            reduction_axis1_over_group_with_atomics_contig_impl<srcTy, dstTy,
+                                                                ReductionOpT>;
+    }
+};
+
+template <typename fnT, typename srcTy>
+struct AllAxis0ContigFactory
+{
+    fnT get() const
+    {
+        using dstTy = std::int32_t;
+        using ReductionOpT = sycl::logical_and<dstTy>;
+        return dpctl::tensor::kernels::
+            reduction_axis0_over_group_with_atomics_contig_impl<srcTy, dstTy,
+                                                                ReductionOpT>;
+    }
+};
+
+void populate_all_dispatch_vectors(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    DispatchVectorBuilder<reduction_strided_impl_fn_ptr, AllStridedFactory,
+                          td_ns::num_types>
+        all_dvb1;
+    all_dvb1.populate_dispatch_vector(all_reduction_strided_dispatch_vector);
+
+    DispatchVectorBuilder<reduction_contig_impl_fn_ptr, AllAxis1ContigFactory,
+                          td_ns::num_types>
+        all_dvb2;
+    all_dvb2.populate_dispatch_vector(
+        all_reduction_axis1_contig_dispatch_vector);
+
+    DispatchVectorBuilder<reduction_contig_impl_fn_ptr, AllAxis0ContigFactory,
+                          td_ns::num_types>
+        all_dvb3;
+    all_dvb3.populate_dispatch_vector(
+        all_reduction_axis0_contig_dispatch_vector);
+};
+
+using atomic_support::atomic_support_fn_ptr_t;
+using atomic_support::check_atomic_support;
+static atomic_support_fn_ptr_t all_atomic_support =
+    check_atomic_support<std::int32_t>;
+
+} // namespace impl
+
+void init_all(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_all_dispatch_vectors();
+        using impl::all_reduction_axis0_contig_dispatch_vector;
+        using impl::all_reduction_axis1_contig_dispatch_vector;
+        using impl::all_reduction_strided_dispatch_vector;
+
+        using impl::all_atomic_support;
+
+        auto all_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_boolean_reduction(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                all_reduction_axis1_contig_dispatch_vector,
+                all_reduction_axis0_contig_dispatch_vector,
+                all_reduction_strided_dispatch_vector, all_atomic_support);
+        };
+        m.def("_all", all_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/all.hpp b/dpctl_ext/tensor/libtensor/source/reductions/all.hpp
new file mode 100644
index 000000000000..5fb184e37c66
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/all.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_all(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/any.cpp b/dpctl_ext/tensor/libtensor/source/reductions/any.cpp
new file mode 100644
index 000000000000..6859e46cbc4a
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/any.cpp
@@ -0,0 +1,164 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    any_reduction_strided_dispatch_vector[td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    any_reduction_axis1_contig_dispatch_vector[td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    any_reduction_axis0_contig_dispatch_vector[td_ns::num_types];
+
+template <typename fnT, typename srcTy>
+struct AnyStridedFactory
+{
+    fnT get() const
+    {
+        using dstTy = std::int32_t;
+        using ReductionOpT = sycl::logical_or<dstTy>;
+        return dpctl::tensor::kernels::
+            reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
+                                                           ReductionOpT>;
+    }
+};
+
+template <typename fnT, typename srcTy>
+struct AnyAxis1ContigFactory
+{
+    fnT get() const
+    {
+        using dstTy = std::int32_t;
+        using ReductionOpT = sycl::logical_or<dstTy>;
+        return dpctl::tensor::kernels::
+            reduction_axis1_over_group_with_atomics_contig_impl<srcTy, dstTy,
+                                                                ReductionOpT>;
+    }
+};
+
+template <typename fnT, typename srcTy>
+struct AnyAxis0ContigFactory
+{
+    fnT get() const
+    {
+        using dstTy = std::int32_t;
+        using ReductionOpT = sycl::logical_or<dstTy>;
+        return dpctl::tensor::kernels::
+            reduction_axis0_over_group_with_atomics_contig_impl<srcTy, dstTy,
+                                                                ReductionOpT>;
+    }
+};
+
+void populate_any_dispatch_vectors(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    DispatchVectorBuilder<reduction_strided_impl_fn_ptr, AnyStridedFactory,
+                          td_ns::num_types>
+        any_dvb1;
+    any_dvb1.populate_dispatch_vector(any_reduction_strided_dispatch_vector);
+
+    DispatchVectorBuilder<reduction_contig_impl_fn_ptr, AnyAxis1ContigFactory,
+                          td_ns::num_types>
+        any_dvb2;
+    any_dvb2.populate_dispatch_vector(
+        any_reduction_axis1_contig_dispatch_vector);
+
+    DispatchVectorBuilder<reduction_contig_impl_fn_ptr, AnyAxis0ContigFactory,
+                          td_ns::num_types>
+        any_dvb3;
+    any_dvb3.populate_dispatch_vector(
+        any_reduction_axis0_contig_dispatch_vector);
+};
+
+using atomic_support::atomic_support_fn_ptr_t;
+using atomic_support::check_atomic_support;
+static atomic_support_fn_ptr_t any_atomic_support =
+    check_atomic_support<std::int32_t>;
+
+} // namespace impl
+
+void init_any(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_any_dispatch_vectors();
+        using impl::any_reduction_axis0_contig_dispatch_vector;
+        using impl::any_reduction_axis1_contig_dispatch_vector;
+        using impl::any_reduction_strided_dispatch_vector;
+
+        using impl::any_atomic_support;
+
+        auto any_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_boolean_reduction(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                any_reduction_axis1_contig_dispatch_vector,
+                any_reduction_axis0_contig_dispatch_vector,
+                any_reduction_strided_dispatch_vector, any_atomic_support);
+        };
+        m.def("_any", any_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/any.hpp b/dpctl_ext/tensor/libtensor/source/reductions/any.hpp
new file mode 100644
index 000000000000..4e368a674615
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/any.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_any(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp b/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp
new file mode 100644
index 000000000000..10fc49759168
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp
@@ -0,0 +1,279 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::search_strided_impl_fn_ptr;
+static search_strided_impl_fn_ptr
+    argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+using dpctl::tensor::kernels::search_contig_impl_fn_ptr;
+static search_contig_impl_fn_ptr
+    argmax_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+using dpctl::tensor::kernels::search_contig_impl_fn_ptr;
+static search_contig_impl_fn_ptr
+    argmax_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportForArgmaxReductionTemps
+{
+
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::int64_t>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::int64_t>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, std::int64_t>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::int64_t>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, std::int64_t>,
+
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    outTy,
+                                    std::int64_t>,
+
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    outTy,
+                                    std::int64_t>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgmaxOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<srcTy,
+                                                             dstTy>::is_defined)
+        {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_over_group_temps_strided_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_over_group_temps_strided_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgmaxOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<srcTy,
+                                                             dstTy>::is_defined)
+        {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis1_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis1_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgmaxOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<srcTy,
+                                                             dstTy>::is_defined)
+        {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis0_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis0_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_argmax_over_axis_dispatch_tables(void)
+{
+    using td_ns::DispatchTableBuilder;
+
+    DispatchTableBuilder<search_strided_impl_fn_ptr,
+                         ArgmaxOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<search_contig_impl_fn_ptr,
+                         ArgmaxOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(argmax_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<search_contig_impl_fn_ptr,
+                         ArgmaxOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(argmax_over_axis0_contig_temps_dispatch_table);
+}
+
+} // namespace impl
+
+void init_argmax(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_argmax_over_axis_dispatch_tables;
+        populate_argmax_over_axis_dispatch_tables();
+        using impl::argmax_over_axis0_contig_temps_dispatch_table;
+        using impl::argmax_over_axis1_contig_temps_dispatch_table;
+        using impl::argmax_over_axis_strided_temps_dispatch_table;
+
+        auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                                const arrayT &dst, sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_search_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                argmax_over_axis_strided_temps_dispatch_table,
+                argmax_over_axis0_contig_temps_dispatch_table,
+                argmax_over_axis1_contig_temps_dispatch_table);
+        };
+        m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp b/dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp
new file mode 100644
index 000000000000..3274f8c7d0cb
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_argmax(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp b/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp
new file mode 100644
index 000000000000..ec4637b62d49
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp
@@ -0,0 +1,279 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::search_strided_impl_fn_ptr;
+static search_strided_impl_fn_ptr
+    argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+using dpctl::tensor::kernels::search_contig_impl_fn_ptr;
+static search_contig_impl_fn_ptr
+    argmin_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+using dpctl::tensor::kernels::search_contig_impl_fn_ptr;
+static search_contig_impl_fn_ptr
+    argmin_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportForArgminReductionTemps
+{
+
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::int64_t>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::int64_t>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, std::int64_t>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::int64_t>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, std::int64_t>,
+
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    outTy,
+                                    std::int64_t>,
+
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    outTy,
+                                    std::int64_t>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgminOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgminReductionTemps<srcTy,
+                                                             dstTy>::is_defined)
+        {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_over_group_temps_strided_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_over_group_temps_strided_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgminOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgminReductionTemps<srcTy,
+                                                             dstTy>::is_defined)
+        {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis1_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis1_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgminOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgminReductionTemps<srcTy,
+                                                             dstTy>::is_defined)
+        {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis0_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis0_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_argmin_over_axis_dispatch_tables(void)
+{
+    using td_ns::DispatchTableBuilder;
+
+    DispatchTableBuilder<search_strided_impl_fn_ptr,
+                         ArgminOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<search_contig_impl_fn_ptr,
+                         ArgminOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(argmin_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<search_contig_impl_fn_ptr,
+                         ArgminOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(argmin_over_axis0_contig_temps_dispatch_table);
+}
+
+} // namespace impl
+
+void init_argmin(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_argmin_over_axis_dispatch_tables;
+        populate_argmin_over_axis_dispatch_tables();
+        using impl::argmin_over_axis0_contig_temps_dispatch_table;
+        using impl::argmin_over_axis1_contig_temps_dispatch_table;
+        using impl::argmin_over_axis_strided_temps_dispatch_table;
+
+        auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                                const arrayT &dst, sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_search_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                argmin_over_axis_strided_temps_dispatch_table,
+                argmin_over_axis0_contig_temps_dispatch_table,
+                argmin_over_axis1_contig_temps_dispatch_table);
+        };
+        m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp b/dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp
new file mode 100644
index 000000000000..1865c258a527
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_argmin(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp
new file mode 100644
index 000000000000..75e4010bfd5b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp
@@ -0,0 +1,258 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    logsumexp_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    logsumexp_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    logsumexp_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForLogSumExpReductionTemps
+{
+
+    static constexpr bool is_defined = std::disjunction<
+#if 1
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
+
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
+
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
+
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
+
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, double>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, double>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+#endif
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct LogSumExpOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = su_ns::LogSumExp<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                        ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct LogSumExpOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = su_ns::LogSumExp<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                             ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct LogSumExpOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = su_ns::LogSumExp<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                             ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_logsumexp_over_axis_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+    using namespace td_ns;
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         LogSumExpOverAxisTempsStridedFactory, num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(
+        logsumexp_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         LogSumExpOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(
+        logsumexp_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         LogSumExpOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(
+        logsumexp_over_axis0_contig_temps_dispatch_table);
+}
+
+} // namespace impl
+
+void init_logsumexp(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_logsumexp_over_axis_dispatch_tables;
+        populate_logsumexp_over_axis_dispatch_tables();
+        using impl::logsumexp_over_axis0_contig_temps_dispatch_table;
+        using impl::logsumexp_over_axis1_contig_temps_dispatch_table;
+        using impl::logsumexp_over_axis_strided_temps_dispatch_table;
+
+        using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+        using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+
+        auto logsumexp_pyapi = [&](const arrayT &src,
+                                   int trailing_dims_to_reduce,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_tree_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                logsumexp_over_axis_strided_temps_dispatch_table,
+                logsumexp_over_axis0_contig_temps_dispatch_table,
+                logsumexp_over_axis1_contig_temps_dispatch_table);
+        };
+        m.def("_logsumexp_over_axis", logsumexp_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto logsumexp_dtype_supported = [&](const py::dtype &input_dtype,
+                                             const py::dtype &output_dtype) {
+            return py_tree_reduction_dtype_supported(
+                input_dtype, output_dtype,
+                logsumexp_over_axis_strided_temps_dispatch_table);
+        };
+        m.def("_logsumexp_over_axis_dtype_supported", logsumexp_dtype_supported,
+              "", py::arg("arg_dtype"), py::arg("out_dtype"));
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp
new file mode 100644
index 000000000000..2e2c19877db6
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logsumexp(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/max.cpp b/dpctl_ext/tensor/libtensor/source/reductions/max.cpp
new file mode 100644
index 000000000000..d19ed226d3b4
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/max.cpp
@@ -0,0 +1,410 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    max_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_strided_impl_fn_ptr
+    max_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    max_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    max_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+/* @brief Types supported by max reduction code based on atomic_ref */
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForMaxReductionAtomic
+{
+    /* value is true if a kernel for <argTy, outTy> must be instantiated, false
+     * otherwise */
+    static constexpr bool is_defined = std::disjunction<
+        // input int32
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        // input uint32
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
+        // input int64
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        // input uint64
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForMaxReductionTemps
+{
+    static constexpr bool is_defined = std::disjunction<
+        // input bool
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    outTy,
+                                    std::complex<float>>,
+
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    outTy,
+                                    std::complex<double>>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxisAtomicStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionAtomic<
+                          srcTy, dstTy>::is_defined)
+        {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_with_atomics_strided_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_with_atomics_strided_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                            ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                            ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxis1AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionAtomic<
+                          srcTy, dstTy>::is_defined)
+        {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxis0AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionAtomic<
+                          srcTy, dstTy>::is_defined)
+        {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_max_over_axis_dispatch_tables(void)
+{
+    using td_ns::DispatchTableBuilder;
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         MaxOverAxisAtomicStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         MaxOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MaxOverAxis1AtomicContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MaxOverAxis0AtomicContigFactory, td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MaxOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(max_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MaxOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(max_over_axis0_contig_temps_dispatch_table);
+}
+
+using atomic_support::atomic_support_fn_ptr_t;
+static atomic_support_fn_ptr_t max_atomic_support_vector[td_ns::num_types];
+
+void populate_max_atomic_support_dispatch_vector(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    using atomic_support::MaxAtomicSupportFactory;
+    DispatchVectorBuilder<atomic_support_fn_ptr_t, MaxAtomicSupportFactory,
+                          td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(max_atomic_support_vector);
+}
+
+} // namespace impl
+
+void init_max(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_max_over_axis_dispatch_tables;
+        populate_max_over_axis_dispatch_tables();
+        using impl::max_over_axis0_contig_atomic_dispatch_table;
+        using impl::max_over_axis0_contig_temps_dispatch_table;
+        using impl::max_over_axis1_contig_atomic_dispatch_table;
+        using impl::max_over_axis1_contig_temps_dispatch_table;
+        using impl::max_over_axis_strided_atomic_dispatch_table;
+        using impl::max_over_axis_strided_temps_dispatch_table;
+
+        using impl::populate_max_atomic_support_dispatch_vector;
+        populate_max_atomic_support_dispatch_vector();
+        using impl::max_atomic_support_vector;
+
+        auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                max_over_axis_strided_atomic_dispatch_table,
+                max_over_axis0_contig_atomic_dispatch_table,
+                max_over_axis1_contig_atomic_dispatch_table,
+                max_over_axis_strided_temps_dispatch_table,
+                max_over_axis0_contig_temps_dispatch_table,
+                max_over_axis1_contig_temps_dispatch_table,
+                max_atomic_support_vector);
+        };
+        m.def("_max_over_axis", max_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/max.hpp b/dpctl_ext/tensor/libtensor/source/reductions/max.hpp
new file mode 100644
index 000000000000..bc242dc8d74b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/max.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_max(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/min.cpp b/dpctl_ext/tensor/libtensor/source/reductions/min.cpp
new file mode 100644
index 000000000000..97d3432b13ed
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/min.cpp
@@ -0,0 +1,412 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    min_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_strided_impl_fn_ptr
+    min_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    min_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    min_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+/* @brief Types supported by min reduction code based on atomic_ref */
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForMinReductionAtomic
+{
+    /* value is true if a kernel for <argTy, outTy> must be instantiated, false
+     * otherwise */
+    static constexpr bool is_defined = std::disjunction<
+        // input int32
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        // input uint32
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
+        // input int64
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        // input uint64
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForMinReductionTemps
+{
+    static constexpr bool is_defined = std::disjunction<
+        // input bool
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    outTy,
+                                    std::complex<float>>,
+
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    outTy,
+                                    std::complex<double>>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxisAtomicStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionAtomic<
+                          srcTy, dstTy>::is_defined)
+        {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_with_atomics_strided_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_with_atomics_strided_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                            ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                            ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxis1AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionAtomic<
+                          srcTy, dstTy>::is_defined)
+        {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxis0AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionAtomic<
+                          srcTy, dstTy>::is_defined)
+        {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_min_over_axis_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+    using td_ns::DispatchTableBuilder;
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         MinOverAxisAtomicStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         MinOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MinOverAxis1AtomicContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MinOverAxis0AtomicContigFactory, td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MinOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(min_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MinOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(min_over_axis0_contig_temps_dispatch_table);
+}
+
+using atomic_support::atomic_support_fn_ptr_t;
+static atomic_support_fn_ptr_t min_atomic_support_vector[td_ns::num_types];
+
+void populate_min_atomic_support_dispatch_vector(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    using atomic_support::MinAtomicSupportFactory;
+    DispatchVectorBuilder<atomic_support_fn_ptr_t, MinAtomicSupportFactory,
+                          td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(min_atomic_support_vector);
+}
+
+} // namespace impl
+
+void init_min(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_min_over_axis_dispatch_tables;
+        populate_min_over_axis_dispatch_tables();
+        using impl::min_over_axis0_contig_atomic_dispatch_table;
+        using impl::min_over_axis0_contig_temps_dispatch_table;
+        using impl::min_over_axis1_contig_atomic_dispatch_table;
+        using impl::min_over_axis1_contig_temps_dispatch_table;
+        using impl::min_over_axis_strided_atomic_dispatch_table;
+        using impl::min_over_axis_strided_temps_dispatch_table;
+
+        using impl::populate_min_atomic_support_dispatch_vector;
+        populate_min_atomic_support_dispatch_vector();
+        using impl::min_atomic_support_vector;
+
+        auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                min_over_axis_strided_atomic_dispatch_table,
+                min_over_axis0_contig_atomic_dispatch_table,
+                min_over_axis1_contig_atomic_dispatch_table,
+                min_over_axis_strided_temps_dispatch_table,
+                min_over_axis0_contig_temps_dispatch_table,
+                min_over_axis1_contig_temps_dispatch_table,
+                min_atomic_support_vector);
+        };
+        m.def("_min_over_axis", min_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/min.hpp b/dpctl_ext/tensor/libtensor/source/reductions/min.hpp
new file mode 100644
index 000000000000..e054f44539f3
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/min.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_min(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp b/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp
new file mode 100644
index 000000000000..6cbb21dfe02c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp
@@ -0,0 +1,466 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
+                                                [td_ns::num_types];
+static reduction_strided_impl_fn_ptr
+    prod_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
+                                                [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
+                                                [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    prod_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    prod_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+
+/* @brief Types supported by plus-reduction code based on atomic_ref */
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForProductReductionAtomic
+{
+
+    /* value if true a kernel for <argTy, outTy> must be instantiated, false
+     * otherwise */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint64_t>,
+        // input int8
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+        // input uint8
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
+        // input int16
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+        // input uint16
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
+        // input int32
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+        // input uint32
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
+        // input int64
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        // input uint64
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForProductReductionTemps
+{
+
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
+
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
+
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
+
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, double>,
+        td_ns::
+            TypePairDefinedEntry<argTy, sycl::half, outTy, std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    sycl::half,
+                                    outTy,
+                                    std::complex<double>>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, double>,
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::complex<double>>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, std::complex<double>>,
+
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    outTy,
+                                    std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    outTy,
+                                    std::complex<double>>,
+
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    outTy,
+                                    std::complex<double>>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ProductOverAxisAtomicStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForProductReductionAtomic<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = sycl::multiplies<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
+                                                               ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ProductOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForProductReductionTemps<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = std::conditional_t<std::is_same_v<dstTy, bool>,
+                                                    sycl::logical_and<dstTy>,
+                                                    sycl::multiplies<dstTy>>;
+            return dpctl::tensor::kernels::
+                reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                        ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ProductOverAxis1AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForProductReductionAtomic<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = sycl::multiplies<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_axis1_over_group_with_atomics_contig_impl<
+                    srcTy, dstTy, ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ProductOverAxis0AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForProductReductionAtomic<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = sycl::multiplies<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_axis0_over_group_with_atomics_contig_impl<
+                    srcTy, dstTy, ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ProductOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForProductReductionTemps<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = std::conditional_t<std::is_same_v<dstTy, bool>,
+                                                    sycl::logical_and<dstTy>,
+                                                    sycl::multiplies<dstTy>>;
+            return dpctl::tensor::kernels::
+                reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                             ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ProductOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForProductReductionTemps<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = std::conditional_t<std::is_same_v<dstTy, bool>,
+                                                    sycl::logical_and<dstTy>,
+                                                    sycl::multiplies<dstTy>>;
+            return dpctl::tensor::kernels::
+                reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                             ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_prod_over_axis_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+    using namespace td_ns;
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         ProductOverAxisAtomicStridedFactory, num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         ProductOverAxisTempsStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         ProductOverAxis1AtomicContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         ProductOverAxis0AtomicContigFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         ProductOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(prod_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         ProductOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(prod_over_axis0_contig_temps_dispatch_table);
+}
+
+using atomic_support::atomic_support_fn_ptr_t;
+static atomic_support_fn_ptr_t prod_atomic_support_vector[td_ns::num_types];
+
+void populate_prod_atomic_support_dispatch_vector(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    using atomic_support::ProductAtomicSupportFactory;
+    DispatchVectorBuilder<atomic_support_fn_ptr_t, ProductAtomicSupportFactory,
+                          td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(prod_atomic_support_vector);
+}
+
+} // namespace impl
+
+void init_prod(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_prod_over_axis_dispatch_tables;
+        populate_prod_over_axis_dispatch_tables();
+        using impl::prod_over_axis0_contig_atomic_dispatch_table;
+        using impl::prod_over_axis0_contig_temps_dispatch_table;
+        using impl::prod_over_axis1_contig_atomic_dispatch_table;
+        using impl::prod_over_axis1_contig_temps_dispatch_table;
+        using impl::prod_over_axis_strided_atomic_dispatch_table;
+        using impl::prod_over_axis_strided_temps_dispatch_table;
+
+        using impl::populate_prod_atomic_support_dispatch_vector;
+        populate_prod_atomic_support_dispatch_vector();
+        using impl::prod_atomic_support_vector;
+
+        auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                              const arrayT &dst, sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                prod_over_axis_strided_atomic_dispatch_table,
+                prod_over_axis0_contig_atomic_dispatch_table,
+                prod_over_axis1_contig_atomic_dispatch_table,
+                prod_over_axis_strided_temps_dispatch_table,
+                prod_over_axis0_contig_temps_dispatch_table,
+                prod_over_axis1_contig_temps_dispatch_table,
+                prod_atomic_support_vector);
+        };
+        m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto prod_dtype_supported =
+            [&](const py::dtype &input_dtype, const py::dtype &output_dtype,
+                const std::string &dst_usm_type, sycl::queue &q) {
+                return py_reduction_dtype_supported(
+                    input_dtype, output_dtype, dst_usm_type, q,
+                    prod_over_axis_strided_atomic_dispatch_table,
+                    prod_over_axis_strided_temps_dispatch_table,
+                    prod_atomic_support_vector);
+            };
+        m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "",
+              py::arg("arg_dtype"), py::arg("out_dtype"),
+              py::arg("dst_usm_type"), py::arg("sycl_queue"));
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/prod.hpp b/dpctl_ext/tensor/libtensor/source/reductions/prod.hpp
new file mode 100644
index 000000000000..15b1c07e5ddd
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/prod.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_prod(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp
new file mode 100644
index 000000000000..5279b4f6c276
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp
@@ -0,0 +1,254 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "reduction_over_axis.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    hypot_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                                [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    hypot_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                                [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    hypot_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                                [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForHypotReductionTemps
+{
+
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
+
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
+
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
+
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
+
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, double>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, double>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct HypotOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForHypotReductionTemps<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = su_ns::Hypot<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                        ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct HypotOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForHypotReductionTemps<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = su_ns::Hypot<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                             ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct HypotOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForHypotReductionTemps<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = su_ns::Hypot<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                             ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_hypot_over_axis_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+    using namespace td_ns;
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         HypotOverAxisTempsStridedFactory, num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(hypot_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         HypotOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(hypot_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         HypotOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(hypot_over_axis0_contig_temps_dispatch_table);
+}
+
+} // namespace impl
+
+void init_reduce_hypot(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_hypot_over_axis_dispatch_tables;
+        populate_hypot_over_axis_dispatch_tables();
+        using impl::hypot_over_axis0_contig_temps_dispatch_table;
+        using impl::hypot_over_axis1_contig_temps_dispatch_table;
+        using impl::hypot_over_axis_strided_temps_dispatch_table;
+
+        using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+        using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+
+        auto hypot_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                               const arrayT &dst, sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_tree_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                hypot_over_axis_strided_temps_dispatch_table,
+                hypot_over_axis0_contig_temps_dispatch_table,
+                hypot_over_axis1_contig_temps_dispatch_table);
+        };
+        m.def("_hypot_over_axis", hypot_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto hypot_dtype_supported = [&](const py::dtype &input_dtype,
+                                         const py::dtype &output_dtype) {
+            return py_tree_reduction_dtype_supported(
+                input_dtype, output_dtype,
+                hypot_over_axis_strided_temps_dispatch_table);
+        };
+        m.def("_hypot_over_axis_dtype_supported", hypot_dtype_supported, "",
+              py::arg("arg_dtype"), py::arg("out_dtype"));
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp
new file mode 100644
index 000000000000..c0a16345af75
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_reduce_hypot(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
new file mode 100644
index 000000000000..5f9cc32f1203
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
@@ -0,0 +1,147 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <type_traits>
+
+#include <sycl/sycl.hpp>
+
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::py_internal::atomic_support
+{
+
+typedef bool (*atomic_support_fn_ptr_t)(const sycl::queue &, sycl::usm::alloc);
+
+/*! @brief Function which returns a constant value for atomic support */
+template <bool return_value>
+bool fixed_decision(const sycl::queue &, sycl::usm::alloc)
+{
+    return return_value;
+}
+
+/*! @brief Template for querying atomic support for a type on a device */
+template <typename T>
+bool check_atomic_support(const sycl::queue &exec_q,
+                          sycl::usm::alloc usm_alloc_type)
+{
+    static constexpr bool atomic32 = (sizeof(T) == 4);
+    static constexpr bool atomic64 = (sizeof(T) == 8);
+    using dpctl::tensor::type_utils::is_complex;
+    if constexpr ((!atomic32 && !atomic64) || is_complex<T>::value) {
+        return fixed_decision<false>(exec_q, usm_alloc_type);
+    }
+    else {
+        bool supports_atomics = false;
+        const sycl::device &dev = exec_q.get_device();
+        if constexpr (atomic64) {
+            if (!dev.has(sycl::aspect::atomic64)) {
+                return false;
+            }
+        }
+        switch (usm_alloc_type) {
+        case sycl::usm::alloc::shared:
+            supports_atomics =
+                dev.has(sycl::aspect::usm_atomic_shared_allocations);
+            break;
+        case sycl::usm::alloc::host:
+            supports_atomics =
+                dev.has(sycl::aspect::usm_atomic_host_allocations);
+            break;
+        case sycl::usm::alloc::device:
+            supports_atomics = true;
+            break;
+        default:
+            supports_atomics = false;
+        }
+        return supports_atomics;
+    }
+}
+
+template <typename fnT, typename T>
+struct ArithmeticAtomicSupportFactory
+{
+    fnT get()
+    {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (std::is_floating_point_v<T> ||
+                      std::is_same_v<T, sycl::half> || is_complex<T>::value)
+        {
+            // for real- and complex- floating point types, tree reduction has
+            // better round-off accumulation properties (round-off error is
+            // proportional to the log2(reduction_size), while naive elementwise
+            // summation used by atomic implementation has round-off error
+            // growing proportional to the reduction_size.), hence reduction
+            // over floating point types should always use tree_reduction
+            // algorithm, even though atomic implementation may be applicable
+            return fixed_decision<false>;
+        }
+        else {
+            return check_atomic_support<T>;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct MinMaxAtomicSupportFactory
+{
+    fnT get()
+    {
+        return check_atomic_support<T>;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaxAtomicSupportFactory : public MinMaxAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct MinAtomicSupportFactory : public MinMaxAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct SumAtomicSupportFactory : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct ProductAtomicSupportFactory
+    : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+} // namespace dpctl::tensor::py_internal::atomic_support
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp
new file mode 100644
index 000000000000..fca5e09e2fe5
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp
@@ -0,0 +1,69 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "all.hpp"
+#include "any.hpp"
+#include "argmax.hpp"
+#include "argmin.hpp"
+#include "logsumexp.hpp"
+#include "max.hpp"
+#include "min.hpp"
+#include "prod.hpp"
+#include "reduce_hypot.hpp"
+#include "sum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+/*! @brief Add reduction functions to Python module */
+void init_reduction_functions(py::module_ m)
+{
+    init_all(m);
+    init_any(m);
+    init_argmax(m);
+    init_argmin(m);
+    init_logsumexp(m);
+    init_max(m);
+    init_min(m);
+    init_prod(m);
+    init_reduce_hypot(m);
+    init_sum(m);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp
new file mode 100644
index 000000000000..4df67c16bc4e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_reduction_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp
new file mode 100644
index 000000000000..936c8dbe9b56
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp
@@ -0,0 +1,1318 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension, specifically functions for reductions.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <exception>
+#include <iterator>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "kernels/reductions.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+/* ====================== dtype supported ======================== */
+
+/*! @brief Template implementing Python API for querying type support by
+ * reduction which may support atomics */
+template <typename fnT, typename CheckAtomicSupportFnT>
+bool py_reduction_dtype_supported(
+    const py::dtype &input_dtype,
+    const py::dtype &output_dtype,
+    const std::string &dst_usm_type,
+    sycl::queue &q,
+    const fnT &atomic_dispatch_table,
+    const fnT &temps_dispatch_table,
+    const CheckAtomicSupportFnT &check_atomic_support)
+{
+    int arg_tn =
+        input_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int out_tn =
+        output_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int arg_typeid = -1;
+    int out_typeid = -1;
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    try {
+        arg_typeid = array_types.typenum_to_lookup_id(arg_tn);
+        out_typeid = array_types.typenum_to_lookup_id(out_tn);
+    } catch (const std::exception &e) {
+        throw py::value_error(e.what());
+    }
+
+    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
+        out_typeid >= td_ns::num_types)
+    {
+        throw std::runtime_error("Reduction type support check: lookup failed");
+    }
+
+    // remove_all_extents gets underlying type of table
+    using fn_ptrT = typename std::remove_all_extents<fnT>::type;
+    fn_ptrT fn = nullptr;
+
+    sycl::usm::alloc kind = sycl::usm::alloc::unknown;
+
+    if (dst_usm_type == "device") {
+        kind = sycl::usm::alloc::device;
+    }
+    else if (dst_usm_type == "shared") {
+        kind = sycl::usm::alloc::shared;
+    }
+    else if (dst_usm_type == "host") {
+        kind = sycl::usm::alloc::host;
+    }
+    else {
+        throw py::value_error("Unrecognized `dst_usm_type` argument.");
+    }
+
+    bool supports_atomics = check_atomic_support[out_typeid](q, kind);
+
+    if (supports_atomics) {
+        fn = atomic_dispatch_table[arg_typeid][out_typeid];
+    }
+
+    if (fn == nullptr) {
+        // use slower reduction implementation using temporaries
+        fn = temps_dispatch_table[arg_typeid][out_typeid];
+    }
+
+    return (fn != nullptr);
+}
+
+/*! @brief Template implementing Python API for querying type support by tree
+ * reduction */
+template <typename fnT>
+bool py_tree_reduction_dtype_supported(const py::dtype &input_dtype,
+                                       const py::dtype &output_dtype,
+                                       const fnT &temps_dispatch_table)
+{
+    int arg_tn =
+        input_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int out_tn =
+        output_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int arg_typeid = -1;
+    int out_typeid = -1;
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    try {
+        arg_typeid = array_types.typenum_to_lookup_id(arg_tn);
+        out_typeid = array_types.typenum_to_lookup_id(out_tn);
+    } catch (const std::exception &e) {
+        throw py::value_error(e.what());
+    }
+
+    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
+        out_typeid >= td_ns::num_types)
+    {
+        throw std::runtime_error("Reduction type support check: lookup failed");
+    }
+
+    auto fn = temps_dispatch_table[arg_typeid][out_typeid];
+
+    return (fn != nullptr);
+}
+
+/* ==================== Generic reductions ====================== */
+
+/*! @brief Template implementing Python API for reduction over axis which may
+ * support atomics */
+template <typename strided_fnT, typename contig_fnT, typename SupportAtomicFnT>
+std::pair<sycl::event, sycl::event> py_reduction_over_axis(
+    const dpctl::tensor::usm_ndarray &src,
+    int trailing_dims_to_reduce, // comp over this many trailing indexes
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends,
+    const strided_fnT &atomic_dispatch_table,
+    const contig_fnT &axis0_atomic_dispatch_table,
+    const contig_fnT &axis1_atomic_dispatch_table,
+    const strided_fnT &temps_dispatch_table,
+    const contig_fnT &axis0_temps_dispatch_table,
+    const contig_fnT &axis1_temps_dispatch_table,
+    const SupportAtomicFnT &check_atomic_support)
+{
+    int src_nd = src.get_ndim();
+    int iteration_nd = src_nd - trailing_dims_to_reduce;
+    if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dim_to_reduce must be positive, but no "
+                              "greater than rank of the array being reduced");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != iteration_nd) {
+        throw py::value_error("Destination array rank does not match input "
+                              "array rank and number of reduced dimensions");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error("Destination shape does not match unreduced "
+                              "dimensions of the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    std::size_t dst_nelems = dst.get_size();
+
+    if (dst_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    std::size_t reduction_nelems(1);
+    for (int i = dst_nd; i < src_nd; ++i) {
+        reduction_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    namespace td_ns = dpctl::tensor::type_dispatch;
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    void *data_ptr = dst.get_data();
+    const auto &ctx = exec_q.get_context();
+    auto usm_type = sycl::get_pointer_type(data_ptr, ctx);
+
+    bool supports_atomics = check_atomic_support[dst_typeid](exec_q, usm_type);
+
+    // handle special case when both reduction and iteration are 1D contiguous
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    if ((is_src_c_contig && is_dst_c_contig) ||
+        (is_src_f_contig && dst_nelems == 1))
+    {
+        // remove_all_extents gets underlying type of table
+        using contig_fn_ptr_T =
+            typename std::remove_all_extents<contig_fnT>::type;
+        contig_fn_ptr_T fn;
+        if (supports_atomics) {
+            fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid];
+        }
+        else {
+            fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
+        }
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+    else if (is_src_f_contig &&
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
+    {
+        // remove_all_extents gets underlying type of table
+        using contig_fn_ptr_T =
+            typename std::remove_all_extents<contig_fnT>::type;
+        contig_fn_ptr_T fn;
+        if (supports_atomics) {
+            fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid];
+        }
+        else {
+            fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
+        }
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+
+    // TODO: not used anywhere
+    auto const &src_shape_vecs = src.get_shape_vector();
+    auto const &src_strides_vecs = src.get_strides_vector();
+    auto const &dst_strides_vecs = dst.get_strides_vector();
+
+    int reduction_nd = trailing_dims_to_reduce;
+    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
+    using shT = std::vector<py::ssize_t>;
+    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                              std::end(src_strides_vecs));
+
+    shT simplified_reduction_shape;
+    shT simplified_reduction_src_strides;
+    py::ssize_t reduction_src_offset(0);
+
+    simplify_iteration_space_1(
+        reduction_nd, reduction_shape_ptr, reduction_src_strides,
+        // output
+        simplified_reduction_shape, simplified_reduction_src_strides,
+        reduction_src_offset);
+
+    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
+
+    shT iteration_src_strides(std::begin(src_strides_vecs),
+                              std::begin(src_strides_vecs) + iteration_nd);
+    shT const &iteration_dst_strides = dst_strides_vecs;
+
+    shT simplified_iteration_shape;
+    shT simplified_iteration_src_strides;
+    shT simplified_iteration_dst_strides;
+    py::ssize_t iteration_src_offset(0);
+    py::ssize_t iteration_dst_offset(0);
+
+    if (iteration_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iteration_nd = 1;
+        simplified_iteration_shape.push_back(1);
+        simplified_iteration_src_strides.push_back(0);
+        simplified_iteration_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
+                                 iteration_src_strides, iteration_dst_strides,
+                                 // output
+                                 simplified_iteration_shape,
+                                 simplified_iteration_src_strides,
+                                 simplified_iteration_dst_strides,
+                                 iteration_src_offset, iteration_dst_offset);
+    }
+
+    if ((reduction_nd == 1) && (iteration_nd == 1)) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        bool array_reduce_all_elems = false;
+        std::size_t iter_nelems = dst_nelems;
+
+        if (simplified_reduction_src_strides[0] == 1) {
+            array_reduce_all_elems = (simplified_iteration_shape[0] == 1);
+            mat_reduce_over_axis1 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (static_cast<std::size_t>(
+                     simplified_iteration_src_strides[0]) == reduction_nelems);
+        }
+        else if (static_cast<std::size_t>(
+                     simplified_reduction_src_strides[0]) == iter_nelems)
+        {
+            mat_reduce_over_axis0 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (simplified_iteration_src_strides[0] == 1);
+        }
+
+        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
+            using contig_fn_ptr_T =
+                typename std::remove_all_extents<contig_fnT>::type;
+            contig_fn_ptr_T fn;
+            if (supports_atomics) {
+                fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid];
+            }
+            else {
+                fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
+            }
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis1_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis1_contig_ev);
+            }
+        }
+        else if (mat_reduce_over_axis0) {
+            using contig_fn_ptr_T =
+                typename std::remove_all_extents<contig_fnT>::type;
+            contig_fn_ptr_T fn;
+            if (supports_atomics) {
+                fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid];
+            }
+            else {
+                fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
+            }
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis0_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis0_contig_ev);
+            }
+        }
+    }
+
+    // remove_all_extents gets underlying type of table
+    using strided_fn_ptr_T =
+        typename std::remove_all_extents<strided_fnT>::type;
+    strided_fn_ptr_T fn = nullptr;
+
+    if (supports_atomics) {
+        fn = atomic_dispatch_table[src_typeid][dst_typeid];
+    }
+
+    if (fn == nullptr) {
+        // use slower reduction implementation using temporaries
+        fn = temps_dispatch_table[src_typeid][dst_typeid];
+        if (fn == nullptr) {
+            throw std::runtime_error("Datatypes are not supported");
+        }
+    }
+
+    std::vector<sycl::event> host_task_events{};
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto arrays_metainfo_packing_triple_ =
+        device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events,
+            // iteration metadata
+            simplified_iteration_shape, simplified_iteration_src_strides,
+            simplified_iteration_dst_strides,
+            // reduction metadata
+            simplified_reduction_shape, simplified_reduction_src_strides);
+    auto tmp_alloc_owner =
+        std::move(std::get<0>(arrays_metainfo_packing_triple_));
+    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
+    const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
+    const py::ssize_t *reduction_shape_stride =
+        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto reduction_ev =
+        fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(),
+           iteration_nd, iter_shape_and_strides, iteration_src_offset,
+           iteration_dst_offset,
+           reduction_nd, // number dimensions being reduced
+           reduction_shape_stride, reduction_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {reduction_ev}, tmp_alloc_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, reduction_ev);
+}
+
+/* ================= No atomic reductions ====================== */
+
+/*! @brief Template implementing Python API for reduction over axis without
+ * atomics */
+template <typename strided_fnT, typename contig_fnT>
+std::pair<sycl::event, sycl::event> py_tree_reduction_over_axis(
+    const dpctl::tensor::usm_ndarray &src,
+    int trailing_dims_to_reduce, // comp over this many trailing indexes
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends,
+    const strided_fnT &temps_dispatch_table,
+    const contig_fnT &axis0_temps_dispatch_table,
+    const contig_fnT &axis1_temps_dispatch_table)
+{
+    int src_nd = src.get_ndim();
+    int iteration_nd = src_nd - trailing_dims_to_reduce;
+    if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dim_to_reduce must be positive, but no "
+                              "greater than rank of the array being reduced");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != iteration_nd) {
+        throw py::value_error("Destination array rank does not match input "
+                              "array rank and number of reduced dimensions");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error("Destination shape does not match unreduced "
+                              "dimensions of the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    std::size_t dst_nelems = dst.get_size();
+
+    if (dst_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    std::size_t reduction_nelems(1);
+    for (int i = dst_nd; i < src_nd; ++i) {
+        reduction_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    namespace td_ns = dpctl::tensor::type_dispatch;
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    // handle special case when both reduction and iteration are 1D contiguous
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    if ((is_src_c_contig && is_dst_c_contig) ||
+        (is_src_f_contig && dst_nelems == 1))
+    {
+        auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+    else if (is_src_f_contig &&
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
+    {
+        auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+
+    auto const &src_shape_vecs = src.get_shape_vector();
+    auto const &src_strides_vecs = src.get_strides_vector();
+    auto const &dst_strides_vecs = dst.get_strides_vector();
+
+    int reduction_nd = trailing_dims_to_reduce;
+    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
+    using shT = std::vector<py::ssize_t>;
+    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                              std::end(src_strides_vecs));
+
+    shT simplified_reduction_shape;
+    shT simplified_reduction_src_strides;
+    py::ssize_t reduction_src_offset(0);
+
+    simplify_iteration_space_1(
+        reduction_nd, reduction_shape_ptr, reduction_src_strides,
+        // output
+        simplified_reduction_shape, simplified_reduction_src_strides,
+        reduction_src_offset);
+
+    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
+
+    shT iteration_src_strides(std::begin(src_strides_vecs),
+                              std::begin(src_strides_vecs) + iteration_nd);
+    shT const &iteration_dst_strides = dst_strides_vecs;
+
+    shT simplified_iteration_shape;
+    shT simplified_iteration_src_strides;
+    shT simplified_iteration_dst_strides;
+    py::ssize_t iteration_src_offset(0);
+    py::ssize_t iteration_dst_offset(0);
+
+    if (iteration_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iteration_nd = 1;
+        simplified_iteration_shape.push_back(1);
+        simplified_iteration_src_strides.push_back(0);
+        simplified_iteration_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
+                                 iteration_src_strides, iteration_dst_strides,
+                                 // output
+                                 simplified_iteration_shape,
+                                 simplified_iteration_src_strides,
+                                 simplified_iteration_dst_strides,
+                                 iteration_src_offset, iteration_dst_offset);
+    }
+
+    if ((reduction_nd == 1) && (iteration_nd == 1)) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        bool array_reduce_all_elems = false;
+        std::size_t iter_nelems = dst_nelems;
+
+        if (simplified_reduction_src_strides[0] == 1) {
+            array_reduce_all_elems = (simplified_iteration_shape[0] == 1);
+            mat_reduce_over_axis1 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (static_cast<std::size_t>(
+                     simplified_iteration_src_strides[0]) == reduction_nelems);
+        }
+        else if (static_cast<std::size_t>(
+                     simplified_reduction_src_strides[0]) == iter_nelems)
+        {
+            mat_reduce_over_axis0 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (simplified_iteration_src_strides[0] == 1);
+        }
+
+        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
+            auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis1_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis1_contig_ev);
+            }
+        }
+        else if (mat_reduce_over_axis0) {
+            auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis0_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis0_contig_ev);
+            }
+        }
+    }
+
+    auto fn = temps_dispatch_table[src_typeid][dst_typeid];
+    if (fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    std::vector<sycl::event> host_task_events{};
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto arrays_metainfo_packing_triple_ =
+        device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events,
+            // iteration metadata
+            simplified_iteration_shape, simplified_iteration_src_strides,
+            simplified_iteration_dst_strides,
+            // reduction metadata
+            simplified_reduction_shape, simplified_reduction_src_strides);
+    auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_));
+    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
+    const py::ssize_t *temp_allocation_ptr = tmp_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
+    const py::ssize_t *reduction_shape_stride =
+        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto reduction_ev =
+        fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(),
+           iteration_nd, iter_shape_and_strides, iteration_src_offset,
+           iteration_dst_offset,
+           reduction_nd, // number dimensions being reduced
+           reduction_shape_stride, reduction_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {reduction_ev}, tmp_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, reduction_ev);
+}
+
+/*! @brief Template implementing Python API for searching over an axis */
+template <typename strided_fnT, typename contig_fnT>
+std::pair<sycl::event, sycl::event> py_search_over_axis(
+    const dpctl::tensor::usm_ndarray &src,
+    int trailing_dims_to_reduce, // comp over this many trailing indexes
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends,
+    const strided_fnT &strided_dispatch_table,
+    const contig_fnT &axis0_contig_dispatch_table,
+    const contig_fnT &axis1_contig_dispatch_table)
+{
+    int src_nd = src.get_ndim();
+    int iteration_nd = src_nd - trailing_dims_to_reduce;
+    if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dim_to_reduce must be positive, but no "
+                              "greater than rank of the array being reduced");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != iteration_nd) {
+        throw py::value_error("Destination array rank does not match input "
+                              "array rank and number of reduced dimensions");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error("Destination shape does not match unreduced "
+                              "dimensions of the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    std::size_t dst_nelems = dst.get_size();
+
+    if (dst_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    std::size_t reduction_nelems(1);
+    for (int i = dst_nd; i < src_nd; ++i) {
+        reduction_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    namespace td_ns = dpctl::tensor::type_dispatch;
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    // handle special case when both reduction and iteration are 1D contiguous
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    if (is_src_c_contig && is_dst_c_contig) {
+        auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+    else if (is_src_f_contig && dst_nd == 1) {
+        auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+
+    auto const &src_shape_vecs = src.get_shape_vector();
+    auto const &src_strides_vecs = src.get_strides_vector();
+    auto const &dst_strides_vecs = dst.get_strides_vector();
+
+    int reduction_nd = trailing_dims_to_reduce;
+    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
+    using shT = std::vector<py::ssize_t>;
+    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                              std::end(src_strides_vecs));
+
+    shT compact_reduction_shape;
+    shT compact_reduction_src_strides;
+    py::ssize_t reduction_src_offset(0);
+
+    // TODO: not used anywhere
+    compact_iteration_space(
+        reduction_nd, reduction_shape_ptr, reduction_src_strides,
+        // output
+        compact_reduction_shape, compact_reduction_src_strides);
+
+    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
+
+    shT iteration_src_strides(std::begin(src_strides_vecs),
+                              std::begin(src_strides_vecs) + iteration_nd);
+    shT const &iteration_dst_strides = dst_strides_vecs;
+
+    shT simplified_iteration_shape;
+    shT simplified_iteration_src_strides;
+    shT simplified_iteration_dst_strides;
+    py::ssize_t iteration_src_offset(0);
+    py::ssize_t iteration_dst_offset(0);
+
+    if (iteration_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iteration_nd = 1;
+        simplified_iteration_shape.push_back(1);
+        simplified_iteration_src_strides.push_back(0);
+        simplified_iteration_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
+                                 iteration_src_strides, iteration_dst_strides,
+                                 // output
+                                 simplified_iteration_shape,
+                                 simplified_iteration_src_strides,
+                                 simplified_iteration_dst_strides,
+                                 iteration_src_offset, iteration_dst_offset);
+    }
+
+    if ((reduction_nd == 1) && (iteration_nd == 1)) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        std::size_t iter_nelems = dst_nelems;
+
+        if (compact_reduction_src_strides[0] == 1) {
+            mat_reduce_over_axis1 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (static_cast<std::size_t>(
+                     simplified_iteration_src_strides[0]) == reduction_nelems);
+        }
+        else if (static_cast<std::size_t>(compact_reduction_src_strides[0]) ==
+                 iter_nelems)
+        {
+            mat_reduce_over_axis0 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (simplified_iteration_src_strides[0] == 1);
+        }
+
+        if (mat_reduce_over_axis1) {
+            auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis1_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis1_contig_ev);
+            }
+        }
+        else if (mat_reduce_over_axis0) {
+            auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis0_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis0_contig_ev);
+            }
+        }
+    }
+
+    auto fn = strided_dispatch_table[src_typeid][dst_typeid];
+    if (fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    std::vector<sycl::event> host_task_events{};
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+
+    auto arrays_metainfo_packing_triple_ =
+        device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events,
+            // iteration metadata
+            simplified_iteration_shape, simplified_iteration_src_strides,
+            simplified_iteration_dst_strides,
+            // reduction metadata
+            compact_reduction_shape, compact_reduction_src_strides);
+    auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_));
+    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
+    const py::ssize_t *temp_allocation_ptr = tmp_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
+    const py::ssize_t *reduction_shape_stride =
+        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(),
+                      dst.get_data(), iteration_nd, iter_shape_and_strides,
+                      iteration_src_offset, iteration_dst_offset,
+                      reduction_nd, // number dimensions being reduced
+                      reduction_shape_stride, reduction_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {comp_ev}, tmp_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, comp_ev);
+}
+
+/* ================= Atomic only reductions ====================== */
+
+/*! @brief Template implementing Python API for boolean reductions over an axis
+ */
+template <typename contig_dispatchT,
+          typename strided_dispatchT,
+          typename atomic_support_fnT>
+std::pair<sycl::event, sycl::event>
+    py_boolean_reduction(const dpctl::tensor::usm_ndarray &src,
+                         int trailing_dims_to_reduce,
+                         const dpctl::tensor::usm_ndarray &dst,
+                         sycl::queue &exec_q,
+                         const std::vector<sycl::event> &depends,
+                         const contig_dispatchT &axis1_contig_dispatch_vector,
+                         const contig_dispatchT &axis0_contig_dispatch_vector,
+                         const strided_dispatchT &strided_dispatch_vector,
+                         const atomic_support_fnT check_atomic_support)
+{
+    int src_nd = src.get_ndim();
+    int iter_nd = src_nd - trailing_dims_to_reduce;
+    if (trailing_dims_to_reduce <= 0 || iter_nd < 0) {
+        throw py::value_error("Trailing_dim_to_reduce must be positive, but no "
+                              "greater than rank of the array being reduced");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != iter_nd) {
+        throw py::value_error("Destination array rank does not match input "
+                              "array rank and number of reduced dimensions");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error("Destination shape does not match unreduced "
+                              "dimensions of the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    std::size_t dst_nelems = dst.get_size();
+
+    std::size_t red_nelems(1);
+    for (int i = dst_nd; i < src_nd; ++i) {
+        red_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(dst, src)) {
+        throw py::value_error("Arrays are expected to have no memory overlap");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    static constexpr int int32_typeid =
+        static_cast<int>(td_ns::typenum_t::INT32);
+    if (dst_typeid != int32_typeid) {
+        throw py::value_error(
+            "Unexpected data type of destination array, expecting 'int32'");
+    }
+
+    void *data_ptr = dst.get_data();
+    const auto &ctx = exec_q.get_context();
+    auto usm_type = sycl::get_pointer_type(data_ptr, ctx);
+
+    bool supports_atomics = check_atomic_support(exec_q, usm_type);
+    if (!supports_atomics) {
+        throw py::value_error(
+            "This reduction is not supported for this device and usm_type.");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    // TODO: should be dst_nelems == 0?
+    if ((is_src_c_contig && is_dst_c_contig) ||
+        (is_src_f_contig && dst_nelems == 0))
+    {
+        auto fn = axis1_contig_dispatch_vector[src_typeid];
+        static constexpr py::ssize_t zero_offset = 0;
+
+        sycl::event red_ev =
+            fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset,
+               zero_offset, zero_offset, depends);
+
+        sycl::event keep_args_event =
+            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
+
+        return std::make_pair(keep_args_event, red_ev);
+    }
+    else if (is_src_f_contig &&
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
+    {
+        auto fn = axis0_contig_dispatch_vector[src_typeid];
+        static constexpr py::ssize_t zero_offset = 0;
+
+        sycl::event red_ev =
+            fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset,
+               zero_offset, zero_offset, depends);
+
+        sycl::event keep_args_event =
+            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
+
+        return std::make_pair(keep_args_event, red_ev);
+    }
+
+    auto src_shape_vecs = src.get_shape_vector();
+    auto src_strides_vecs = src.get_strides_vector();
+    auto dst_strides_vecs = dst.get_strides_vector();
+
+    int simplified_red_nd = trailing_dims_to_reduce;
+
+    using shT = std::vector<py::ssize_t>;
+    shT red_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                        std::end(src_strides_vecs));
+
+    shT simplified_red_shape;
+    shT simplified_red_src_strides;
+    py::ssize_t red_src_offset(0);
+
+    simplify_iteration_space_1(
+        simplified_red_nd, src_shape_ptr + dst_nd, red_src_strides,
+        // output
+        simplified_red_shape, simplified_red_src_strides, red_src_offset);
+
+    shT iter_src_strides(std::begin(src_strides_vecs),
+                         std::begin(src_strides_vecs) + iter_nd);
+    shT const &iter_dst_strides = dst_strides_vecs;
+
+    shT simplified_iter_shape;
+    shT simplified_iter_src_strides;
+    shT simplified_iter_dst_strides;
+    py::ssize_t iter_src_offset(0);
+    py::ssize_t iter_dst_offset(0);
+
+    if (iter_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iter_nd = 1;
+        simplified_iter_shape.push_back(1);
+        simplified_iter_src_strides.push_back(0);
+        simplified_iter_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
+            // output
+            simplified_iter_shape, simplified_iter_src_strides,
+            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
+    }
+
+    if (simplified_red_nd == 1 && iter_nd == 1) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        bool array_reduce_all_elems = false;
+        std::size_t iter_nelems = dst_nelems;
+
+        if (simplified_red_src_strides[0] == 1) {
+            array_reduce_all_elems = (simplified_iter_shape[0] == 1);
+            mat_reduce_over_axis1 =
+                (simplified_iter_dst_strides[0] == 1) &&
+                (static_cast<std::size_t>(simplified_iter_src_strides[0]) ==
+                 red_nelems);
+        }
+        else if (static_cast<std::size_t>(simplified_red_src_strides[0]) ==
+                 iter_nelems) {
+            mat_reduce_over_axis0 = (simplified_iter_dst_strides[0] == 1) &&
+                                    (simplified_iter_src_strides[0] == 1);
+        }
+        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
+            auto fn = axis1_contig_dispatch_vector[src_typeid];
+
+            sycl::event red_ev =
+                fn(exec_q, iter_nelems, red_nelems, src_data, dst_data,
+                   iter_src_offset, iter_dst_offset, red_src_offset, depends);
+
+            sycl::event keep_args_event =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
+
+            return std::make_pair(keep_args_event, red_ev);
+        }
+        else if (mat_reduce_over_axis0) {
+            auto fn = axis0_contig_dispatch_vector[src_typeid];
+
+            sycl::event red_ev =
+                fn(exec_q, iter_nelems, red_nelems, src_data, dst_data,
+                   iter_src_offset, iter_dst_offset, red_src_offset, depends);
+
+            sycl::event keep_args_event =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
+
+            return std::make_pair(keep_args_event, red_ev);
+        }
+    }
+
+    auto fn = strided_dispatch_vector[src_typeid];
+
+    std::vector<sycl::event> host_task_events{};
+    auto iter_red_metadata_packing_triple_ =
+        dpctl::tensor::offset_utils::device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, simplified_iter_shape,
+            simplified_iter_src_strides, simplified_iter_dst_strides,
+            simplified_red_shape, simplified_red_src_strides);
+    auto packed_shapes_strides_owner =
+        std::move(std::get<0>(iter_red_metadata_packing_triple_));
+    const auto &copy_metadata_ev =
+        std::get<2>(iter_red_metadata_packing_triple_);
+    const py::ssize_t *packed_shapes_and_strides =
+        packed_shapes_strides_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
+    const py::ssize_t *red_shape_stride =
+        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto red_ev =
+        fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, iter_nd,
+           iter_shape_and_strides, iter_src_offset, iter_dst_offset,
+           simplified_red_nd, red_shape_stride, red_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {red_ev}, packed_shapes_strides_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, red_ev);
+}
+
+extern void init_reduction_functions(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp b/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp
new file mode 100644
index 000000000000..d7142477750a
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp
@@ -0,0 +1,463 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_strided_impl_fn_ptr
+    sum_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    sum_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    sum_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+/* @brief Types supported by plus-reduction code based on atomic_ref */
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForSumReductionAtomic
+{
+
+    /* value if true a kernel for <argTy, outTy> must be instantiated, false
+     * otherwise */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint64_t>,
+        // input int8
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+        // input uint8
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
+        // input int16
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+        // input uint16
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
+        // input int32
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+        // input uint32
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
+        // input int64
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        // input uint64
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForSumReductionTemps
+{
+
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
+
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
+
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
+
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
+
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, double>,
+        td_ns::
+            TypePairDefinedEntry<argTy, sycl::half, outTy, std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    sycl::half,
+                                    outTy,
+                                    std::complex<double>>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, double>,
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::complex<double>>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, std::complex<double>>,
+
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    outTy,
+                                    std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    outTy,
+                                    std::complex<double>>,
+
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    outTy,
+                                    std::complex<double>>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct SumOverAxisAtomicStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForSumReductionAtomic<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = sycl::plus<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
+                                                               ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct SumOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForSumReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            using ReductionOpT =
+                std::conditional_t<std::is_same_v<dstTy, bool>,
+                                   sycl::logical_or<dstTy>, sycl::plus<dstTy>>;
+            return dpctl::tensor::kernels::
+                reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                        ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct SumOverAxis1AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForSumReductionAtomic<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = sycl::plus<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_axis1_over_group_with_atomics_contig_impl<
+                    srcTy, dstTy, ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct SumOverAxis0AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForSumReductionAtomic<
+                          srcTy, dstTy>::is_defined)
+        {
+            using ReductionOpT = sycl::plus<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_axis0_over_group_with_atomics_contig_impl<
+                    srcTy, dstTy, ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct SumOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForSumReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            using ReductionOpT =
+                std::conditional_t<std::is_same_v<dstTy, bool>,
+                                   sycl::logical_or<dstTy>, sycl::plus<dstTy>>;
+            return dpctl::tensor::kernels::
+                reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                             ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct SumOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForSumReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            using ReductionOpT =
+                std::conditional_t<std::is_same_v<dstTy, bool>,
+                                   sycl::logical_or<dstTy>, sycl::plus<dstTy>>;
+            return dpctl::tensor::kernels::
+                reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                             ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_sum_over_axis_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+    using namespace td_ns;
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         SumOverAxisAtomicStridedFactory, num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         SumOverAxisTempsStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         SumOverAxis1AtomicContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         SumOverAxis0AtomicContigFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         SumOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(sum_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         SumOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(sum_over_axis0_contig_temps_dispatch_table);
+}
+
+using atomic_support::atomic_support_fn_ptr_t;
+static atomic_support_fn_ptr_t sum_atomic_support_vector[td_ns::num_types];
+
+void populate_sum_atomic_support_dispatch_vector(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    using atomic_support::SumAtomicSupportFactory;
+    DispatchVectorBuilder<atomic_support_fn_ptr_t, SumAtomicSupportFactory,
+                          td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(sum_atomic_support_vector);
+}
+
+} // namespace impl
+
+void init_sum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_sum_over_axis_dispatch_tables;
+        populate_sum_over_axis_dispatch_tables();
+        using impl::sum_over_axis0_contig_atomic_dispatch_table;
+        using impl::sum_over_axis0_contig_temps_dispatch_table;
+        using impl::sum_over_axis1_contig_atomic_dispatch_table;
+        using impl::sum_over_axis1_contig_temps_dispatch_table;
+        using impl::sum_over_axis_strided_atomic_dispatch_table;
+        using impl::sum_over_axis_strided_temps_dispatch_table;
+
+        using impl::populate_sum_atomic_support_dispatch_vector;
+        populate_sum_atomic_support_dispatch_vector();
+        using impl::sum_atomic_support_vector;
+
+        auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                sum_over_axis_strided_atomic_dispatch_table,
+                sum_over_axis0_contig_atomic_dispatch_table,
+                sum_over_axis1_contig_atomic_dispatch_table,
+                sum_over_axis_strided_temps_dispatch_table,
+                sum_over_axis0_contig_temps_dispatch_table,
+                sum_over_axis1_contig_temps_dispatch_table,
+                sum_atomic_support_vector);
+        };
+        m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto sum_dtype_supported =
+            [&](const py::dtype &input_dtype, const py::dtype &output_dtype,
+                const std::string &dst_usm_type, sycl::queue &q) {
+                return py_reduction_dtype_supported(
+                    input_dtype, output_dtype, dst_usm_type, q,
+                    sum_over_axis_strided_atomic_dispatch_table,
+                    sum_over_axis_strided_temps_dispatch_table,
+                    sum_atomic_support_vector);
+            };
+        m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "",
+              py::arg("arg_dtype"), py::arg("out_dtype"),
+              py::arg("dst_usm_type"), py::arg("sycl_queue"));
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/sum.hpp b/dpctl_ext/tensor/libtensor/source/reductions/sum.hpp
new file mode 100644
index 000000000000..08add902a049
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/reductions/sum.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_sum(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp b/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp
new file mode 100644
index 000000000000..6e6a24f7b934
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp
@@ -0,0 +1,43 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "reductions/reduction_common.hpp"
+
+PYBIND11_MODULE(_tensor_reductions_impl, m)
+{
+    dpctl::tensor::py_internal::init_reduction_functions(m);
+}
diff --git a/dpnp/dpnp_iface_counting.py b/dpnp/dpnp_iface_counting.py
index a4b85aa85294..a8ebafbcead7 100644
--- a/dpnp/dpnp_iface_counting.py
+++ b/dpnp/dpnp_iface_counting.py
@@ -39,8 +39,9 @@
 
 """
 
-import dpctl.tensor as dpt
-
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 import dpnp
 
 
diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py
index 3e3501b14c7c..a81416a28e43 100644
--- a/dpnp/dpnp_iface_logic.py
+++ b/dpnp/dpnp_iface_logic.py
@@ -44,14 +44,13 @@
 # pylint: disable=no-name-in-module
 
 
-import dpctl.tensor as dpt
 import dpctl.tensor._tensor_elementwise_impl as ti
 import dpctl.utils as dpu
 import numpy
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
 from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc
@@ -1276,7 +1275,7 @@ def isin(
         usm_element = dpnp.get_usm_ndarray(element)
         usm_test = dpnp.get_usm_ndarray(test_elements)
     return dpnp_array._create_from_usm_ndarray(
-        dpt_ext.isin(
+        dpt.isin(
             usm_element,
             usm_test,
             invert=invert,
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index 7eb44f79ae38..b5afd9523d67 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -428,7 +428,9 @@ def _get_first_nan_index(usm_a):
         if first_nan is not None:
             # all NaNs are collapsed, so need to put a count of all NaNs
             # at the last index
-            dpt.sum(usm_res.counts[first_nan:], out=usm_res.counts[first_nan])
+            dpt_ext.sum(
+                usm_res.counts[first_nan:], out=usm_res.counts[first_nan]
+            )
             result += (usm_res.counts[: first_nan + 1],)
         else:
             result += (usm_res.counts,)
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index 000c343abdb4..cdcdd3af92e4 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -47,14 +47,13 @@
 import builtins
 import warnings
 
-import dpctl.tensor as dpt
 import dpctl.tensor._tensor_elementwise_impl as ti
 import dpctl.utils as dpu
 import numpy
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
@@ -730,7 +729,7 @@ def clip(a, /, min=None, max=None, *, out=None, order="K", **kwargs):
     usm_max = None if max is None else dpnp.get_usm_ndarray_or_scalar(max)
 
     usm_out = None if out is None else dpnp.get_usm_ndarray(out)
-    usm_res = dpt_ext.clip(usm_arr, usm_min, usm_max, out=usm_out, order=order)
+    usm_res = dpt.clip(usm_arr, usm_min, usm_max, out=usm_out, order=order)
     if out is not None and isinstance(out, dpnp_array):
         return out
     return dpnp_array._create_from_usm_ndarray(usm_res)
@@ -1126,7 +1125,7 @@ def cumprod(a, axis=None, dtype=None, out=None):
     return dpnp_wrap_reduction_call(
         usm_a,
         out,
-        dpt_ext.cumulative_prod,
+        dpt.cumulative_prod,
         _get_reduction_res_dt(a, dtype),
         axis=axis,
         dtype=dtype,
@@ -1218,7 +1217,7 @@ def cumsum(a, axis=None, dtype=None, out=None):
     return dpnp_wrap_reduction_call(
         usm_a,
         out,
-        dpt_ext.cumulative_sum,
+        dpt.cumulative_sum,
         _get_reduction_res_dt(a, dtype),
         axis=axis,
         dtype=dtype,
@@ -1307,7 +1306,7 @@ def cumulative_prod(
     return dpnp_wrap_reduction_call(
         dpnp.get_usm_ndarray(x),
         out,
-        dpt_ext.cumulative_prod,
+        dpt.cumulative_prod,
         _get_reduction_res_dt(x, dtype),
         axis=axis,
         dtype=dtype,
@@ -1403,7 +1402,7 @@ def cumulative_sum(
     return dpnp_wrap_reduction_call(
         dpnp.get_usm_ndarray(x),
         out,
-        dpt_ext.cumulative_sum,
+        dpt.cumulative_sum,
         _get_reduction_res_dt(x, dtype),
         axis=axis,
         dtype=dtype,
diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py
index 055aaa999c3a..19279f81286a 100644
--- a/dpnp/dpnp_iface_searching.py
+++ b/dpnp/dpnp_iface_searching.py
@@ -39,12 +39,10 @@
 
 """
 
-import dpctl.tensor as dpt
-
 # pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as dti
 import dpnp
 
@@ -376,13 +374,13 @@ def searchsorted(a, v, side="left", sorter=None):
 
     usm_a = dpnp.get_usm_ndarray(a)
     if dpnp.isscalar(v):
-        usm_v = dpt_ext.asarray(v, sycl_queue=a.sycl_queue, usm_type=a.usm_type)
+        usm_v = dpt.asarray(v, sycl_queue=a.sycl_queue, usm_type=a.usm_type)
     else:
         usm_v = dpnp.get_usm_ndarray(v)
 
     usm_sorter = None if sorter is None else dpnp.get_usm_ndarray(sorter)
     return dpnp_array._create_from_usm_ndarray(
-        dpt_ext.searchsorted(usm_a, usm_v, side=side, sorter=usm_sorter)
+        dpt.searchsorted(usm_a, usm_v, side=side, sorter=usm_sorter)
     )
 
 
@@ -474,7 +472,5 @@ def where(condition, x=None, y=None, /, *, order="K", out=None):
     usm_condition = dpnp.get_usm_ndarray(condition)
 
     usm_out = None if out is None else dpnp.get_usm_ndarray(out)
-    usm_res = dpt_ext.where(
-        usm_condition, usm_x, usm_y, order=order, out=usm_out
-    )
+    usm_res = dpt.where(usm_condition, usm_x, usm_y, order=order, out=usm_out)
     return dpnp.get_result_array(usm_res, out)
diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py
index 9d3ccc40ecf5..75fe215837b9 100644
--- a/dpnp/dpnp_iface_statistics.py
+++ b/dpnp/dpnp_iface_statistics.py
@@ -1118,7 +1118,7 @@ def max(a, axis=None, out=None, keepdims=False, initial=None, where=True):
     return dpnp_wrap_reduction_call(
         usm_a,
         out,
-        dpt.max,
+        dpt_ext.max,
         a.dtype,
         axis=axis,
         keepdims=keepdims,
@@ -1395,7 +1395,7 @@ def min(a, axis=None, out=None, keepdims=False, initial=None, where=True):
     return dpnp_wrap_reduction_call(
         usm_a,
         out,
-        dpt.min,
+        dpt_ext.min,
         a.dtype,
         axis=axis,
         keepdims=keepdims,
diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py
index 460a0dc80f0f..a17c7dfd9d9a 100644
--- a/dpnp/dpnp_iface_trigonometric.py
+++ b/dpnp/dpnp_iface_trigonometric.py
@@ -42,13 +42,11 @@
 # pylint: disable=protected-access
 # pylint: disable=no-name-in-module
 
-
-import dpctl.tensor as dpt
 import dpctl.tensor._tensor_elementwise_impl as ti
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
@@ -935,7 +933,7 @@ def cumlogsumexp(
     return dpnp_wrap_reduction_call(
         usm_x,
         out,
-        dpt_ext.cumulative_logsumexp,
+        dpt.cumulative_logsumexp,
         _get_accumulation_res_dt(x, dtype),
         axis=axis,
         dtype=dtype,

From 0f6d63e0511c08c3796eea19686a6c6954038b27 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Tue, 10 Mar 2026 17:08:26 +0100
Subject: [PATCH 12/43] Add missing includes (#2810)

The PR adds missing includes to tensor source and header files.
---
 .../tensor/libtensor/include/kernels/accumulators.hpp    | 2 ++
 .../include/kernels/elementwise_functions/minimum.hpp    | 1 +
 dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp  | 3 +++
 .../tensor/libtensor/include/utils/type_dispatch.hpp     | 1 +
 .../libtensor/include/utils/type_dispatch_building.hpp   | 1 +
 .../libtensor/source/boolean_advanced_indexing.cpp       | 1 +
 .../tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp | 1 +
 .../tensor/libtensor/source/simplify_iteration_space.cpp | 9 ++++++---
 dpnp/backend/include/dpnp4pybind11.hpp                   | 3 +++
 9 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp
index 6451bc950006..60382e210d8b 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp
@@ -33,6 +33,8 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
+
+#include <algorithm>
 #include <array>
 #include <cstddef>
 #include <cstdint>
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index d18577a5cf4e..cb7d86377984 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -37,6 +37,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <type_traits>
+#include <vector>
 
 #include <sycl/sycl.hpp>
 
diff --git a/dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp
index 1cb70adafeec..f5ea4d4ca486 100644
--- a/dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -33,9 +33,12 @@
 #pragma once
 
 #include <algorithm>
+#include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <iterator>
+#include <limits>
 #include <type_traits>
 #include <vector>
 
diff --git a/dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp
index 242c2cf8724a..d08187aeaacc 100644
--- a/dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp
+++ b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp
@@ -32,6 +32,7 @@
 
 #pragma once
 
+#include <cassert>
 #include <cstdint>
 #include <stdexcept>
 #include <string>
diff --git a/dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp
index b1e02eb1513b..431e020fbdbe 100644
--- a/dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp
+++ b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp
@@ -32,6 +32,7 @@
 
 #pragma once
 
+#include <cassert>
 #include <complex>
 #include <cstdint>
 #include <type_traits>
diff --git a/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp
index 4c46e1e2fec8..e44abbd48303 100644
--- a/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp
+++ b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -35,6 +35,7 @@
 
 #include <algorithm>
 #include <array>
+#include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <functional>
diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
index 9ea49ae1d88b..43a6fbf4a0dd 100644
--- a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
+++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
@@ -33,6 +33,7 @@
 //===----------------------------------------------------------------------===//
 
 #include <array>
+#include <cassert>
 #include <cstddef>
 #include <sycl/sycl.hpp>
 #include <tuple>
diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp
index e3cff701ed50..5e42938a22f2 100644
--- a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp
+++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp
@@ -32,14 +32,17 @@
 /// This file defines functions of dpctl.tensor._tensor_impl extensions
 //===--------------------------------------------------------------------===//
 
-#include "simplify_iteration_space.hpp"
-#include "utils/strided_iters.hpp"
 #include <algorithm>
+#include <cassert>
 #include <cstddef>
 #include <iterator>
-#include <pybind11/pybind11.h>
 #include <vector>
 
+#include <pybind11/pybind11.h>
+
+#include "simplify_iteration_space.hpp"
+#include "utils/strided_iters.hpp"
+
 namespace dpctl::tensor::py_internal
 {
 
diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp
index cd287989bef2..87657d1fbecd 100644
--- a/dpnp/backend/include/dpnp4pybind11.hpp
+++ b/dpnp/backend/include/dpnp4pybind11.hpp
@@ -30,11 +30,14 @@
 
 #include "dpctl_capi.h"
 
+#include <array>
 #include <complex>
 #include <cstddef> // for std::size_t for C++ linkage
+#include <cstdint>
 #include <memory>
 #include <stddef.h> // for size_t for C linkage
 #include <stdexcept>
+#include <type_traits>
 #include <utility>
 #include <vector>
 

From 2ef9a94b51697a1b291d2c6cec279f1d0baea8d3 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 16 Mar 2026 14:18:35 +0100
Subject: [PATCH 13/43] Move `_tensor_elementwise_impl` (unary) extension and
 use it for dpnp (#2795)

This PR initializes `_tensor_elementwise_impl ` pybind11 extension in
`dpctl_ext.tensor` and extends `dpctl_ext.tensor ` Python API with the
part of unary functions : `abs, acos , acosh, angle. atan, atanh, bitwise_invert. ceil, conj`

This is the first part of the work on migrating
`_tensor_elementwise_impl` (unary)_
---
 dpctl_ext/tensor/CMakeLists.txt               | 100 ++++-
 dpctl_ext/tensor/__init__.py                  |  24 ++
 dpctl_ext/tensor/_elementwise_common.py       | 285 ++++++++++++++
 dpctl_ext/tensor/_elementwise_funcs.py        | 360 ++++++++++++++++++
 .../kernels/elementwise_functions/abs.hpp     | 239 ++++++++++++
 .../kernels/elementwise_functions/acos.hpp    | 273 +++++++++++++
 .../kernels/elementwise_functions/acosh.hpp   | 304 +++++++++++++++
 .../kernels/elementwise_functions/angle.hpp   | 215 +++++++++++
 .../kernels/elementwise_functions/asin.hpp    | 296 ++++++++++++++
 .../kernels/elementwise_functions/asinh.hpp   | 279 ++++++++++++++
 .../kernels/elementwise_functions/atan.hpp    | 288 ++++++++++++++
 .../kernels/elementwise_functions/atanh.hpp   | 280 ++++++++++++++
 .../elementwise_functions/bitwise_invert.hpp  | 231 +++++++++++
 .../elementwise_functions/cabs_impl.hpp       |  77 ++++
 .../kernels/elementwise_functions/ceil.hpp    | 230 +++++++++++
 .../kernels/elementwise_functions/common.hpp  |   1 +
 .../kernels/elementwise_functions/conj.hpp    | 234 ++++++++++++
 .../source/elementwise_functions/abs.cpp      | 125 ++++++
 .../source/elementwise_functions/abs.hpp      |  46 +++
 .../source/elementwise_functions/acos.cpp     | 125 ++++++
 .../source/elementwise_functions/acos.hpp     |  46 +++
 .../source/elementwise_functions/acosh.cpp    | 127 ++++++
 .../source/elementwise_functions/acosh.hpp    |  46 +++
 .../source/elementwise_functions/angle.cpp    | 127 ++++++
 .../source/elementwise_functions/angle.hpp    |  46 +++
 .../source/elementwise_functions/asin.cpp     | 125 ++++++
 .../source/elementwise_functions/asin.hpp     |  46 +++
 .../source/elementwise_functions/asinh.cpp    | 127 ++++++
 .../source/elementwise_functions/asinh.hpp    |  46 +++
 .../source/elementwise_functions/atan.cpp     | 125 ++++++
 .../source/elementwise_functions/atan.hpp     |  46 +++
 .../source/elementwise_functions/atanh.cpp    | 127 ++++++
 .../source/elementwise_functions/atanh.hpp    |  46 +++
 .../elementwise_functions/bitwise_invert.cpp  | 129 +++++++
 .../elementwise_functions/bitwise_invert.hpp  |  46 +++
 .../source/elementwise_functions/ceil.cpp     | 125 ++++++
 .../source/elementwise_functions/ceil.hpp     |  46 +++
 .../source/elementwise_functions/conj.cpp     | 125 ++++++
 .../source/elementwise_functions/conj.hpp     |  46 +++
 .../elementwise_common.cpp                    | 191 ++++++++++
 .../elementwise_common.hpp                    |  46 +++
 .../elementwise_functions.hpp                 | 284 ++++++++++++++
 .../elementwise_functions_type_utils.cpp      |  96 +++++
 .../elementwise_functions_type_utils.hpp      |  56 +++
 .../libtensor/source/tensor_elementwise.cpp   |  45 +++
 dpnp/dpnp_iface_bitwise.py                    |   7 +-
 dpnp/dpnp_iface_mathematical.py               |  17 +-
 dpnp/dpnp_iface_trigonometric.py              |  25 +-
 48 files changed, 6353 insertions(+), 23 deletions(-)
 create mode 100644 dpctl_ext/tensor/_elementwise_common.py
 create mode 100644 dpctl_ext/tensor/_elementwise_funcs.py
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/abs.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/abs.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/acos.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/acos.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/acosh.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/acosh.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/angle.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/angle.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/asin.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/asin.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/asinh.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/asinh.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/atan.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/atan.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/atanh.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/atanh.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/ceil.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/ceil.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/conj.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/conj.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_elementwise.cpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index cf55035c23d9..1a9649b91f82 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -69,6 +69,81 @@ set(_accumulator_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp
 )
+set(_elementwise_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_common.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acos.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acosh.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/add.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/angle.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asin.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asinh.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan2.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atanh.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_and.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_invert.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_or.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_xor.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp2.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/expm1.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor_divide.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater_equal.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/hypot.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/imag.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isfinite.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isinf.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isnan.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less_equal.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log1p.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/negative.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/nextafter.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/reciprocal.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/round.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/rsqrt.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sign.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/signbit.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sin.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sinh.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/square.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp
+    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/trunc.cpp
+)
 set(_reduction_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/all.cpp
@@ -95,6 +170,10 @@ set(_tensor_accumulation_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp
     ${_accumulator_sources}
 )
+set(_tensor_elementwise_impl_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_elementwise.cpp
+    ${_elementwise_sources}
+)
 set(_tensor_reductions_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp
     ${_reduction_sources}
@@ -131,6 +210,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_i
 target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
 list(APPEND _py_trgts ${python_module_name})
 
+set(python_module_name _tensor_elementwise_impl)
+pybind11_add_module(${python_module_name} MODULE ${_tensor_elementwise_impl_sources})
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_elementwise_impl_sources})
+target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
+list(APPEND _py_trgts ${python_module_name})
+
 set(python_module_name _tensor_reductions_impl)
 pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources})
@@ -157,7 +242,7 @@ set(_no_fast_math_sources
 )
 list(
     APPEND _no_fast_math_sources
-    # ${_elementwise_sources}
+    ${_elementwise_sources}
     ${_reduction_sources}
     ${_sorting_sources}
     # ${_linalg_sources}
@@ -175,6 +260,19 @@ endforeach()
 
 set(_compiler_definitions "")
 
+foreach(_src_fn ${_elementwise_sources})
+    get_source_file_property(_cmpl_options_defs ${_src_fn} COMPILE_DEFINITIONS)
+    if(${_cmpl_options_defs})
+        set(_combined_options_defs ${_cmpl_options_defs} "${_compiler_definitions}")
+    else()
+        set(_combined_options_defs "${_compiler_definitions}")
+    endif()
+    set_source_files_properties(
+        ${_src_fn}
+        PROPERTIES COMPILE_DEFINITIONS "${_combined_options_defs}"
+    )
+endforeach()
+
 set(_linker_options "LINKER:${DPNP_LDFLAGS}")
 foreach(python_module_name ${_py_trgts})
     target_compile_options(
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index ac24151bedfe..be7ec6851b5b 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -53,6 +53,19 @@
     zeros,
     zeros_like,
 )
+from ._elementwise_funcs import (
+    abs,
+    acos,
+    acosh,
+    angle,
+    asin,
+    asinh,
+    atan,
+    atanh,
+    bitwise_invert,
+    ceil,
+    conj,
+)
 from ._indexing_functions import (
     extract,
     nonzero,
@@ -104,19 +117,30 @@
 from ._utility_functions import all, any, diff
 
 __all__ = [
+    "abs",
+    "acos",
+    "acosh",
     "all",
+    "angle",
     "any",
     "arange",
     "argmax",
     "argmin",
     "argsort",
     "asarray",
+    "asin",
+    "asinh",
     "asnumpy",
     "astype",
+    "atan",
+    "atanh",
+    "bitwise_invert",
     "broadcast_arrays",
     "broadcast_to",
     "can_cast",
+    "ceil",
     "concat",
+    "conj",
     "copy",
     "count_nonzero",
     "clip",
diff --git a/dpctl_ext/tensor/_elementwise_common.py b/dpctl_ext/tensor/_elementwise_common.py
new file mode 100644
index 000000000000..7811c01d9ce2
--- /dev/null
+++ b/dpctl_ext/tensor/_elementwise_common.py
@@ -0,0 +1,285 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import dpctl.tensor as dpt
+from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._tensor_impl as ti
+
+from ._copy_utils import _empty_like_orderK
+from ._type_utils import (
+    _acceptance_fn_default_unary,
+    _all_data_types,
+    _find_buf_dtype,
+)
+
+
+class UnaryElementwiseFunc:
+    """
+    Class that implements unary element-wise functions.
+
+    Args:
+        name (str):
+            Name of the unary function
+        result_type_resovler_fn (callable):
+            Function that takes dtype of the input and
+            returns the dtype of the result if the
+            implementation functions supports it, or
+            returns `None` otherwise.
+        unary_dp_impl_fn (callable):
+            Data-parallel implementation function with signature
+            `impl_fn(src: usm_ndarray, dst: usm_ndarray,
+             sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src` is the argument array, `dst` is the
+            array to be populated with function values, effectively
+            evaluating `dst = func(src)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including lifetime management of argument Python objects to ensure
+            that their associated USM allocation is not freed before offloaded
+            computational tasks complete execution, while the second event
+            corresponds to computational tasks associated with function
+            evaluation.
+        acceptance_fn (callable, optional):
+            Function to influence type promotion behavior of this unary
+            function. The function takes 4 arguments:
+                arg_dtype - Data type of the first argument
+                buf_dtype - Data type the argument would be cast to
+                res_dtype - Data type of the output array with function values
+                sycl_dev - The :class:`dpctl.SyclDevice` where the function
+                    evaluation is carried out.
+            The function is invoked when the argument of the unary function
+            requires casting, e.g. the argument of `dpctl.tensor.log` is an
+            array with integral data type.
+        docs (str):
+            Documentation string for the unary function.
+    """
+
+    def __init__(
+        self,
+        name,
+        result_type_resolver_fn,
+        unary_dp_impl_fn,
+        docs,
+        acceptance_fn=None,
+    ):
+        self.__name__ = "UnaryElementwiseFunc"
+        self.name_ = name
+        self.result_type_resolver_fn_ = result_type_resolver_fn
+        self.types_ = None
+        self.unary_fn_ = unary_dp_impl_fn
+        self.__doc__ = docs
+        if callable(acceptance_fn):
+            self.acceptance_fn_ = acceptance_fn
+        else:
+            self.acceptance_fn_ = _acceptance_fn_default_unary
+
+    def __str__(self):
+        return f"<{self.__name__} '{self.name_}'>"
+
+    def __repr__(self):
+        return f"<{self.__name__} '{self.name_}'>"
+
+    def get_implementation_function(self):
+        """Returns the implementation function for
+        this elementwise unary function.
+
+        """
+        return self.unary_fn_
+
+    def get_type_result_resolver_function(self):
+        """Returns the type resolver function for this
+        elementwise unary function.
+        """
+        return self.result_type_resolver_fn_
+
+    def get_type_promotion_path_acceptance_function(self):
+        """Returns the acceptance function for this
+        elementwise binary function.
+
+        Acceptance function influences the type promotion
+        behavior of this unary function.
+        The function takes 4 arguments:
+            arg_dtype - Data type of the first argument
+            buf_dtype - Data type the argument would be cast to
+            res_dtype - Data type of the output array with function values
+            sycl_dev - The :class:`dpctl.SyclDevice` where the function
+                evaluation is carried out.
+        The function is invoked when the argument of the unary function
+        requires casting, e.g. the argument of `dpctl.tensor.log` is an
+        array with integral data type.
+        """
+        return self.acceptance_fn_
+
+    @property
+    def nin(self):
+        """Returns the number of arguments treated as inputs."""
+        return 1
+
+    @property
+    def nout(self):
+        """Returns the number of arguments treated as outputs."""
+        return 1
+
+    @property
+    def types(self):
+        """Returns information about types supported by
+        implementation function, using NumPy's character
+        encoding for data types, e.g.
+
+        :Example:
+            .. code-block:: python
+
+                dpctl.tensor.sin.types
+                # Outputs: ['e->e', 'f->f', 'd->d', 'F->F', 'D->D']
+        """
+        types = self.types_
+        if not types:
+            types = []
+            for dt1 in _all_data_types(True, True):
+                dt2 = self.result_type_resolver_fn_(dt1)
+                if dt2:
+                    types.append(f"{dt1.char}->{dt2.char}")
+            self.types_ = types
+        return types
+
+    def __call__(self, x, /, *, out=None, order="K"):
+        if not isinstance(x, dpt.usm_ndarray):
+            raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+        if order not in ["C", "F", "K", "A"]:
+            order = "K"
+        buf_dt, res_dt = _find_buf_dtype(
+            x.dtype,
+            self.result_type_resolver_fn_,
+            x.sycl_device,
+            acceptance_fn=self.acceptance_fn_,
+        )
+        if res_dt is None:
+            raise ValueError(
+                f"function '{self.name_}' does not support input type "
+                f"({x.dtype}), "
+                "and the input could not be safely coerced to any "
+                "supported types according to the casting rule ''safe''."
+            )
+
+        orig_out = out
+        if out is not None:
+            if not isinstance(out, dpt.usm_ndarray):
+                raise TypeError(
+                    f"output array must be of usm_ndarray type, got {type(out)}"
+                )
+
+            if not out.flags.writable:
+                raise ValueError("provided `out` array is read-only")
+
+            if out.shape != x.shape:
+                raise ValueError(
+                    "The shape of input and output arrays are inconsistent. "
+                    f"Expected output shape is {x.shape}, got {out.shape}"
+                )
+
+            if res_dt != out.dtype:
+                raise ValueError(
+                    f"Output array of type {res_dt} is needed, "
+                    f"got {out.dtype}"
+                )
+
+            if (
+                buf_dt is None
+                and ti._array_overlap(x, out)
+                and not ti._same_logical_tensors(x, out)
+            ):
+                # Allocate a temporary buffer to avoid memory overlapping.
+                # Note if `buf_dt` is not None, a temporary copy of `x` will be
+                # created, so the array overlap check isn't needed.
+                out = dpt_ext.empty_like(out)
+
+            if (
+                dpctl.utils.get_execution_queue((x.sycl_queue, out.sycl_queue))
+                is None
+            ):
+                raise ExecutionPlacementError(
+                    "Input and output allocation queues are not compatible"
+                )
+
+        exec_q = x.sycl_queue
+        _manager = SequentialOrderManager[exec_q]
+        if buf_dt is None:
+            if out is None:
+                if order == "K":
+                    out = _empty_like_orderK(x, res_dt)
+                else:
+                    if order == "A":
+                        order = "F" if x.flags.f_contiguous else "C"
+                    out = dpt_ext.empty_like(x, dtype=res_dt, order=order)
+
+            dep_evs = _manager.submitted_events
+            ht_unary_ev, unary_ev = self.unary_fn_(
+                x, out, sycl_queue=exec_q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_unary_ev, unary_ev)
+
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out, dst=orig_out, sycl_queue=exec_q, depends=[unary_ev]
+                )
+                _manager.add_event_pair(ht_copy_ev, cpy_ev)
+                out = orig_out
+
+            return out
+
+        if order == "K":
+            buf = _empty_like_orderK(x, buf_dt)
+        else:
+            if order == "A":
+                order = "F" if x.flags.f_contiguous else "C"
+            buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order)
+
+        dep_evs = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        if out is None:
+            if order == "K":
+                out = _empty_like_orderK(buf, res_dt)
+            else:
+                out = dpt_ext.empty_like(buf, dtype=res_dt, order=order)
+
+        ht, uf_ev = self.unary_fn_(
+            buf, out, sycl_queue=exec_q, depends=[copy_ev]
+        )
+        _manager.add_event_pair(ht, uf_ev)
+
+        return out
diff --git a/dpctl_ext/tensor/_elementwise_funcs.py b/dpctl_ext/tensor/_elementwise_funcs.py
new file mode 100644
index 000000000000..3a3c05915732
--- /dev/null
+++ b/dpctl_ext/tensor/_elementwise_funcs.py
@@ -0,0 +1,360 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._tensor_elementwise_impl as ti
+
+from ._elementwise_common import UnaryElementwiseFunc
+
+# U01: ==== ABS    (x)
+_abs_docstring_ = r"""
+abs(x, /, \*, out=None, order='K')
+
+Calculates the absolute value for each element `x_i` of input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array,
+        if parameter `out` is ``None``.
+        Default: `"K"`.
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise absolute values.
+        For complex input, the absolute value is its magnitude.
+        If `x` has a real-valued data type, the returned array has the
+        same data type as `x`. If `x` has a complex floating-point data type,
+        the returned array has a real-valued floating-point data type whose
+        precision matches the precision of `x`.
+"""
+
+abs = UnaryElementwiseFunc("abs", ti._abs_result_type, ti._abs, _abs_docstring_)
+del _abs_docstring_
+
+# U02: ==== ACOS   (x)
+_acos_docstring = r"""
+acos(x, /, \*, out=None, order='K')
+
+Computes inverse cosine for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise inverse cosine, in radians
+        and in the closed interval :math:`[0, \pi]`. The data type of the
+        returned array is determined by the Type Promotion Rules.
+"""
+
+acos = UnaryElementwiseFunc(
+    "acos", ti._acos_result_type, ti._acos, _acos_docstring
+)
+del _acos_docstring
+
+# U03: ===== ACOSH (x)
+_acosh_docstring = r"""
+acosh(x, /, \*, out=None, order='K')
+
+Computes inverse hyperbolic cosine for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise inverse hyperbolic cosine, in
+        radians and in the half-closed interval :math:`[0, \infty)`. The data
+        type of the returned array is determined by the Type Promotion Rules.
+"""
+
+acosh = UnaryElementwiseFunc(
+    "acosh", ti._acosh_result_type, ti._acosh, _acosh_docstring
+)
+del _acosh_docstring
+
+# U04: ===== ASIN  (x)
+_asin_docstring = r"""
+asin(x, /, \*, out=None, order='K')
+
+Computes inverse sine for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise inverse sine, in radians
+        and in the closed interval :math:`[-\pi/2, \pi/2]`. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+asin = UnaryElementwiseFunc(
+    "asin", ti._asin_result_type, ti._asin, _asin_docstring
+)
+del _asin_docstring
+
+# U05: ===== ASINH (x)
+_asinh_docstring = r"""
+asinh(x, /, \*, out=None, order='K')
+
+Computes inverse hyperbolic sine for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise inverse hyperbolic sine, in
+        radians. The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+asinh = UnaryElementwiseFunc(
+    "asinh", ti._asinh_result_type, ti._asinh, _asinh_docstring
+)
+del _asinh_docstring
+
+# U06: ===== ATAN  (x)
+_atan_docstring = r"""
+atan(x, /, \*, out=None, order='K')
+
+Computes inverse tangent for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise inverse tangent, in radians
+        and in the closed interval :math:`[-\pi/2, \pi/2]`. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+atan = UnaryElementwiseFunc(
+    "atan", ti._atan_result_type, ti._atan, _atan_docstring
+)
+del _atan_docstring
+
+# U07: ===== ATANH (x)
+_atanh_docstring = r"""
+atanh(x, /, \*, out=None, order='K')
+
+Computes hyperbolic inverse tangent for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise hyperbolic inverse tangent, in
+        radians. The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+atanh = UnaryElementwiseFunc(
+    "atanh", ti._atanh_result_type, ti._atanh, _atanh_docstring
+)
+del _atanh_docstring
+
+# U08: ===== BITWISE_INVERT        (x)
+_bitwise_invert_docstring = r"""
+bitwise_invert(x, /, \*, out=None, order='K')
+
+Inverts (flips) each bit for each element `x_i` of the input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have integer or boolean data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise results.
+        The data type of the returned array is same as the data type of the
+        input array.
+"""
+
+bitwise_invert = UnaryElementwiseFunc(
+    "bitwise_invert",
+    ti._bitwise_invert_result_type,
+    ti._bitwise_invert,
+    _bitwise_invert_docstring,
+)
+del _bitwise_invert_docstring
+
+# U09: ==== CEIL          (x)
+_ceil_docstring = r"""
+ceil(x, /, \*, out=None, order='K')
+
+Returns the ceiling for each element `x_i` for input array `x`.
+
+The ceil of `x_i` is the smallest integer `n`, such that `n >= x_i`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a boolean or real-valued data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise ceiling.
+"""
+
+ceil = UnaryElementwiseFunc(
+    "ceil", ti._ceil_result_type, ti._ceil, _ceil_docstring
+)
+del _ceil_docstring
+
+# U10: ==== CONJ          (x)
+_conj_docstring = r"""
+conj(x, /, \*, out=None, order='K')
+
+Computes conjugate of each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise conjugate values.
+"""
+
+conj = UnaryElementwiseFunc(
+    "conj", ti._conj_result_type, ti._conj, _conj_docstring
+)
+del _conj_docstring
+
+# U43: ==== ANGLE        (x)
+_angle_docstring = r"""
+angle(x, /, \*, out=None, order='K')
+
+Computes the phase angle (also called the argument) of each element `x_i` for
+input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a complex floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise phase angles.
+        The returned array has a floating-point data type determined
+        by the Type Promotion Rules.
+"""
+
+angle = UnaryElementwiseFunc(
+    "angle",
+    ti._angle_result_type,
+    ti._angle,
+    _angle_docstring,
+)
+del _angle_docstring
+
+del ti
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
new file mode 100644
index 000000000000..1f0b3df33e4e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -0,0 +1,239 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ABS(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "cabs_impl.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::abs
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::ssize_t;
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct AbsFunctor
+{
+
+    using is_constant = typename std::false_type;
+    // constexpr resT constant_value = resT{};
+    using supports_vec = typename std::false_type;
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &x) const
+    {
+
+        if constexpr (std::is_same_v<argT, bool> ||
+                      (std::is_integral<argT>::value &&
+                       std::is_unsigned<argT>::value))
+        {
+            static_assert(std::is_same_v<resT, argT>);
+            return x;
+        }
+        else {
+            if constexpr (is_complex<argT>::value) {
+                return detail::cabs(x);
+            }
+            else if constexpr (std::is_same_v<argT, sycl::half> ||
+                               std::is_floating_point_v<argT>)
+            {
+                return (sycl::signbit(x) ? -x : x);
+            }
+            else {
+                return sycl::abs(x);
+            }
+        }
+    }
+};
+
+template <typename argT,
+          typename resT = argT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using AbsContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           AbsFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename T>
+struct AbsOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, bool>,
+        td_ns::TypeMapResultEntry<T, std::uint8_t>,
+        td_ns::TypeMapResultEntry<T, std::uint16_t>,
+        td_ns::TypeMapResultEntry<T, std::uint32_t>,
+        td_ns::TypeMapResultEntry<T, std::uint64_t>,
+        td_ns::TypeMapResultEntry<T, std::int8_t>,
+        td_ns::TypeMapResultEntry<T, std::int16_t>,
+        td_ns::TypeMapResultEntry<T, std::int32_t>,
+        td_ns::TypeMapResultEntry<T, std::int64_t>,
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>, float>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy>
+struct AbsContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class abs_contig_kernel;
+
+template <typename argTy>
+sycl::event abs_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg_p,
+                            char *res_p,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using AbsHS = hyperparam_detail::AbsContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = AbsHS::vec_sz;
+    static constexpr std::uint8_t n_vec = AbsHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct AbsContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!AbsOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = abs_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct AbsTypeMapFactory
+{
+    /*! @brief get typeid for output type of abs(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename AbsOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename argTy, typename resTy, typename IndexerT>
+using AbsStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, AbsFunctor<argTy, resTy>>;
+
+template <typename T1, typename T2, typename T3>
+class abs_strided_kernel;
+
+template <typename argTy>
+sycl::event abs_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             ssize_t arg_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, AbsOutputType, AbsStridedFunctor, abs_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct AbsStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!AbsOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = abs_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::abs
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
new file mode 100644
index 000000000000..9ceeb0947439
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -0,0 +1,273 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ACOS(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::acos
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct AcosFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+
+            const realT x = std::real(in);
+            const realT y = std::imag(in);
+
+            if (std::isnan(x)) {
+                /* acos(NaN + I*+-Inf) = NaN + I*-+Inf */
+                if (std::isinf(y)) {
+                    return resT{q_nan, -y};
+                }
+
+                /* all other cases involving NaN return NaN + I*NaN. */
+                return resT{q_nan, q_nan};
+            }
+            if (std::isnan(y)) {
+                /* acos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */
+                if (std::isinf(x)) {
+                    return resT{q_nan, -std::numeric_limits<realT>::infinity()};
+                }
+                /* acos(0 + I*NaN) = PI/2 + I*NaN with inexact */
+                if (x == realT(0)) {
+                    const realT res_re = sycl::atan(realT(1)) * 2; // PI/2
+                    return resT{res_re, q_nan};
+                }
+
+                /* all other cases involving NaN return NaN + I*NaN. */
+                return resT{q_nan, q_nan};
+            }
+
+            /*
+             * For large x or y including acos(+-Inf + I*+-Inf)
+             */
+            static constexpr realT r_eps =
+                realT(1) / std::numeric_limits<realT>::epsilon();
+            if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) {
+                using sycl_complexT = exprm_ns::complex<realT>;
+                sycl_complexT log_in =
+                    exprm_ns::log(exprm_ns::complex<realT>(in));
+
+                const realT wx = log_in.real();
+                const realT wy = log_in.imag();
+                const realT rx = sycl::fabs(wy);
+
+                realT ry = wx + sycl::log(realT(2));
+                return resT{rx, (sycl::signbit(y)) ? ry : -ry};
+            }
+
+            /* ordinary cases */
+            return exprm_ns::acos(exprm_ns::complex<realT>(in)); // acos(in);
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            return sycl::acos(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using AcosContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           AcosFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using AcosStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, AcosFunctor<argTy, resTy>>;
+
+template <typename T>
+struct AcosOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy>
+struct AcosContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class acos_contig_kernel;
+
+template <typename argTy>
+sycl::event acos_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using AcosHS = hyperparam_detail::AcosContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = AcosHS::vec_sz;
+    static constexpr std::uint8_t n_vec = AcosHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, AcosOutputType, AcosContigFunctor, acos_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct AcosContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!AcosOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = acos_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct AcosTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::acos(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename AcosOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class acos_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    acos_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, AcosOutputType, AcosStridedFunctor, acos_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct AcosStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!AcosOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = acos_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::acos
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
new file mode 100644
index 000000000000..e356b37361d8
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
@@ -0,0 +1,304 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ACOSH(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::acosh
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct AcoshFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+            /*
+             * acosh(in) = I*acos(in) or -I*acos(in)
+             * where the sign is chosen so Re(acosh(in)) >= 0.
+             * So, we first calculate acos(in) and then acosh(in).
+             */
+            const realT x = std::real(in);
+            const realT y = std::imag(in);
+
+            resT acos_in;
+            if (std::isnan(x)) {
+                /* acos(NaN + I*+-Inf) = NaN + I*-+Inf */
+                if (std::isinf(y)) {
+                    acos_in = resT{q_nan, -y};
+                }
+                else {
+                    acos_in = resT{q_nan, q_nan};
+                }
+            }
+            else if (std::isnan(y)) {
+                /* acos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */
+                static constexpr realT inf =
+                    std::numeric_limits<realT>::infinity();
+
+                if (std::isinf(x)) {
+                    acos_in = resT{q_nan, -inf};
+                }
+                /* acos(0 + I*NaN) = Pi/2 + I*NaN with inexact */
+                else if (x == realT(0)) {
+                    const realT pi_half = sycl::atan(realT(1)) * 2;
+                    acos_in = resT{pi_half, q_nan};
+                }
+                else {
+                    acos_in = resT{q_nan, q_nan};
+                }
+            }
+
+            static constexpr realT r_eps =
+                realT(1) / std::numeric_limits<realT>::epsilon();
+            /*
+             * For large x or y including acos(+-Inf + I*+-Inf)
+             */
+            if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) {
+                using sycl_complexT = typename exprm_ns::complex<realT>;
+                const sycl_complexT log_in = exprm_ns::log(sycl_complexT(in));
+                const realT wx = log_in.real();
+                const realT wy = log_in.imag();
+                const realT rx = sycl::fabs(wy);
+                realT ry = wx + sycl::log(realT(2));
+                acos_in = resT{rx, (sycl::signbit(y)) ? ry : -ry};
+            }
+            else {
+                /* ordinary cases */
+                acos_in =
+                    exprm_ns::acos(exprm_ns::complex<realT>(in)); // acos(in);
+            }
+
+            /* Now we calculate acosh(z) */
+            const realT rx = std::real(acos_in);
+            const realT ry = std::imag(acos_in);
+
+            /* acosh(NaN + I*NaN) = NaN + I*NaN */
+            if (std::isnan(rx) && std::isnan(ry)) {
+                return resT{ry, rx};
+            }
+            /* acosh(NaN + I*+-Inf) = +Inf + I*NaN */
+            /* acosh(+-Inf + I*NaN) = +Inf + I*NaN */
+            if (std::isnan(rx)) {
+                return resT{sycl::fabs(ry), rx};
+            }
+            /* acosh(0 + I*NaN) = NaN + I*NaN */
+            if (std::isnan(ry)) {
+                return resT{ry, ry};
+            }
+            /* ordinary cases */
+            const realT res_im = sycl::copysign(rx, std::imag(in));
+            return resT{sycl::fabs(ry), res_im};
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            return sycl::acosh(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using AcoshContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           AcoshFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using AcoshStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, AcoshFunctor<argTy, resTy>>;
+
+template <typename T>
+struct AcoshOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct AcoshContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class acosh_contig_kernel;
+
+template <typename argTy>
+sycl::event acosh_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using AcoshHS = hyperparam_detail::AcoshContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = AcoshHS::vec_sz;
+    static constexpr std::uint8_t n_vec = AcoshHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, AcoshOutputType, AcoshContigFunctor, acosh_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct AcoshContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!AcoshOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = acosh_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct AcoshTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::acosh(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename AcoshOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class acosh_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    acosh_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, AcoshOutputType, AcoshStridedFunctor, acosh_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct AcoshStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!AcoshOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = acosh_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::acosh
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
new file mode 100644
index 000000000000..93dbd648e575
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
@@ -0,0 +1,215 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ANGLE(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::angle
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct AngleFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        using rT = typename argT::value_type;
+
+        return exprm_ns::arg(exprm_ns::complex<rT>(in)); // arg(in);
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using AngleContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           AngleFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using AngleStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, AngleFunctor<argTy, resTy>>;
+
+template <typename T>
+struct AngleOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, std::complex<float>, float>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct AngleContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class angle_contig_kernel;
+
+template <typename argTy>
+sycl::event angle_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using AngleHS = hyperparam_detail::AngleContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = AngleHS::vec_sz;
+    static constexpr std::uint8_t n_vec = AngleHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, AngleOutputType, AngleContigFunctor, angle_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct AngleContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!AngleOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = angle_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct AngleTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::arg(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename AngleOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class angle_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    angle_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, AngleOutputType, AngleStridedFunctor, angle_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct AngleStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!AngleOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = angle_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::angle
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
new file mode 100644
index 000000000000..d367c1243628
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
@@ -0,0 +1,296 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ASIN(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::asin
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct AsinFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+
+            /*
+             * asin(in) = I * conj( asinh(I * conj(in)) )
+             * so we first calculate w = asinh(I * conj(in)) with
+             * x = real(I * conj(in)) = imag(in)
+             * y = imag(I * conj(in)) = real(in)
+             * and then return {imag(w), real(w)} which is asin(in)
+             */
+            const realT x = std::imag(in);
+            const realT y = std::real(in);
+
+            if (std::isnan(x)) {
+                /* asinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */
+                if (std::isinf(y)) {
+                    const realT asinh_re = y;
+                    const realT asinh_im = q_nan;
+                    return resT{asinh_im, asinh_re};
+                }
+                /* asinh(NaN + I*0) = NaN + I*0 */
+                if (y == realT(0)) {
+                    const realT asinh_re = q_nan;
+                    const realT asinh_im = y;
+                    return resT{asinh_im, asinh_re};
+                }
+                /* All other cases involving NaN return NaN + I*NaN. */
+                return resT{q_nan, q_nan};
+            }
+            else if (std::isnan(y)) {
+                /* asinh(+-Inf + I*NaN) = +-Inf + I*NaN */
+                if (std::isinf(x)) {
+                    const realT asinh_re = x;
+                    const realT asinh_im = q_nan;
+                    return resT{asinh_im, asinh_re};
+                }
+                /* All other cases involving NaN return NaN + I*NaN. */
+                return resT{q_nan, q_nan};
+            }
+
+            /*
+             * For large x or y including asinh(+-Inf + I*+-Inf)
+             * asinh(in) = sign(x)*log(sign(x)*in) + O(1/in^2)   as in ->
+             * infinity The above formula works for the imaginary part as well,
+             * because Im(asinh(in)) = sign(x)*atan2(sign(x)*y, fabs(x)) +
+             * O(y/in^3) as in -> infinity, uniformly in y
+             */
+            static constexpr realT r_eps =
+                realT(1) / std::numeric_limits<realT>::epsilon();
+            if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) {
+                using sycl_complexT = exprm_ns::complex<realT>;
+                const sycl_complexT z{x, y};
+                realT wx, wy;
+                if (!sycl::signbit(x)) {
+                    const auto log_z = exprm_ns::log(z);
+                    wx = log_z.real() + sycl::log(realT(2));
+                    wy = log_z.imag();
+                }
+                else {
+                    const auto log_mz = exprm_ns::log(-z);
+                    wx = log_mz.real() + sycl::log(realT(2));
+                    wy = log_mz.imag();
+                }
+                const realT asinh_re = sycl::copysign(wx, x);
+                const realT asinh_im = sycl::copysign(wy, y);
+                return resT{asinh_im, asinh_re};
+            }
+            /* ordinary cases */
+            return exprm_ns::asin(
+                exprm_ns::complex<realT>(in)); // sycl::asin(in);
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            return sycl::asin(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using AsinContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           AsinFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using AsinStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, AsinFunctor<argTy, resTy>>;
+
+template <typename T>
+struct AsinOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct AsinContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class asin_contig_kernel;
+
+template <typename argTy>
+sycl::event asin_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using AddHS = hyperparam_detail::AsinContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = AddHS::vec_sz;
+    static constexpr std::uint8_t n_vec = AddHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, AsinOutputType, AsinContigFunctor, asin_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct AsinContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!AsinOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = asin_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct AsinTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::asin(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename AsinOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class asin_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    asin_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, AsinOutputType, AsinStridedFunctor, asin_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct AsinStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!AsinOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = asin_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::asin
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
new file mode 100644
index 000000000000..472e04f7cbe8
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
@@ -0,0 +1,279 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ASINH(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::asinh
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct AsinhFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+
+            const realT x = std::real(in);
+            const realT y = std::imag(in);
+
+            if (std::isnan(x)) {
+                /* asinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */
+                if (std::isinf(y)) {
+                    return resT{y, q_nan};
+                }
+                /* asinh(NaN + I*0) = NaN + I*0 */
+                if (y == realT(0)) {
+                    return resT{q_nan, y};
+                }
+                /* All other cases involving NaN return NaN + I*NaN. */
+                return resT{q_nan, q_nan};
+            }
+
+            if (std::isnan(y)) {
+                /* asinh(+-Inf + I*NaN) = +-Inf + I*NaN */
+                if (std::isinf(x)) {
+                    return resT{x, q_nan};
+                }
+                /* All other cases involving NaN return NaN + I*NaN. */
+                return resT{q_nan, q_nan};
+            }
+
+            /*
+             * For large x or y including asinh(+-Inf + I*+-Inf)
+             * asinh(in) = sign(x)*log(sign(x)*in) + O(1/in^2)   as in ->
+             * infinity The above formula works for the imaginary part as well,
+             * because Im(asinh(in)) = sign(x)*atan2(sign(x)*y, fabs(x)) +
+             * O(y/in^3) as in -> infinity, uniformly in y
+             */
+            static constexpr realT r_eps =
+                realT(1) / std::numeric_limits<realT>::epsilon();
+
+            if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) {
+                using sycl_complexT = exprm_ns::complex<realT>;
+                sycl_complexT log_in = (sycl::signbit(x))
+                                           ? exprm_ns::log(sycl_complexT(-in))
+                                           : exprm_ns::log(sycl_complexT(in));
+                realT wx = log_in.real() + sycl::log(realT(2));
+                realT wy = log_in.imag();
+
+                const realT res_re = sycl::copysign(wx, x);
+                const realT res_im = sycl::copysign(wy, y);
+                return resT{res_re, res_im};
+            }
+
+            /* ordinary cases */
+            return exprm_ns::asinh(exprm_ns::complex<realT>(in)); // asinh(in);
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            return sycl::asinh(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using AsinhContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           AsinhFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using AsinhStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, AsinhFunctor<argTy, resTy>>;
+
+template <typename T>
+struct AsinhOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct AsinhContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class asinh_contig_kernel;
+
+template <typename argTy>
+sycl::event asinh_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using AsinhHS = hyperparam_detail::AsinhContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = AsinhHS::vec_sz;
+    static constexpr std::uint8_t n_vec = AsinhHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, AsinhOutputType, AsinhContigFunctor, asinh_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct AsinhContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!AsinhOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = asinh_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct AsinhTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::asinh(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename AsinhOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class asinh_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    asinh_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, AsinhOutputType, AsinhStridedFunctor, asinh_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct AsinhStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!AsinhOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = asinh_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::asinh
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
new file mode 100644
index 000000000000..ab07a3fce3e0
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
@@ -0,0 +1,288 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ATAN(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::atan
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::vec_size_utils::ContigHyperparameterSetDefault;
+using dpctl::tensor::kernels::vec_size_utils::UnaryContigHyperparameterSetEntry;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct AtanFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+            /*
+             * atan(in) = I * conj( atanh(I * conj(in)) )
+             * so we first calculate w = atanh(I * conj(in)) with
+             * x = real(I * conj(in)) = imag(in)
+             * y = imag(I * conj(in)) = real(in)
+             * and then return {imag(w), real(w)} which is atan(in)
+             */
+            const realT x = std::imag(in);
+            const realT y = std::real(in);
+            if (std::isnan(x)) {
+                /* atanh(NaN + I*+-Inf) = sign(NaN)*0 + I*+-Pi/2 */
+                if (std::isinf(y)) {
+                    const realT pi_half = sycl::atan(realT(1)) * 2;
+
+                    const realT atanh_re = sycl::copysign(realT(0), x);
+                    const realT atanh_im = sycl::copysign(pi_half, y);
+                    return resT{atanh_im, atanh_re};
+                }
+                /*
+                 * All other cases involving NaN return NaN + I*NaN.
+                 */
+                return resT{q_nan, q_nan};
+            }
+            else if (std::isnan(y)) {
+                /* atanh(+-Inf + I*NaN) = +-0 + I*NaN */
+                if (std::isinf(x)) {
+                    const realT atanh_re = sycl::copysign(realT(0), x);
+                    const realT atanh_im = q_nan;
+                    return resT{atanh_im, atanh_re};
+                }
+                /* atanh(+-0 + I*NaN) = +-0 + I*NaN */
+                if (x == realT(0)) {
+                    return resT{q_nan, x};
+                }
+                /*
+                 * All other cases involving NaN return NaN + I*NaN.
+                 */
+                return resT{q_nan, q_nan};
+            }
+
+            /*
+             * For large x or y including
+             * atanh(+-Inf + I*+-Inf) = 0 + I*+-PI/2
+             * The sign of pi/2 depends on the sign of imaginary part of the
+             * input.
+             */
+            static constexpr realT r_eps =
+                realT(1) / std::numeric_limits<realT>::epsilon();
+            if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) {
+                const realT pi_half = sycl::atan(realT(1)) * 2;
+
+                const realT atanh_re = realT(0);
+                const realT atanh_im = sycl::copysign(pi_half, y);
+                return resT{atanh_im, atanh_re};
+            }
+            /* ordinary cases */
+            return exprm_ns::atan(exprm_ns::complex<realT>(in)); // atan(in);
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            return sycl::atan(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using AtanContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           AtanFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using AtanStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, AtanFunctor<argTy, resTy>>;
+
+template <typename T>
+struct AtanOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct AtanContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class atan_contig_kernel;
+
+template <typename argTy>
+sycl::event atan_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using AtanHS = hyperparam_detail::AtanContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = AtanHS::vec_sz;
+    static constexpr std::uint8_t n_vec = AtanHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, AtanOutputType, AtanContigFunctor, atan_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct AtanContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!AtanOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = atan_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct AtanTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::atan(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename AtanOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class atan_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    atan_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, AtanOutputType, AtanStridedFunctor, atan_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct AtanStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!AtanOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = atan_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::atan
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
new file mode 100644
index 000000000000..f72380ae3de9
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
@@ -0,0 +1,280 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ATANH(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::atanh
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct AtanhFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+
+            const realT x = std::real(in);
+            const realT y = std::imag(in);
+
+            if (std::isnan(x)) {
+                /* atanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */
+                if (std::isinf(y)) {
+                    const realT pi_half = sycl::atan(realT(1)) * 2;
+
+                    const realT res_re = sycl::copysign(realT(0), x);
+                    const realT res_im = sycl::copysign(pi_half, y);
+                    return resT{res_re, res_im};
+                }
+                /*
+                 * All other cases involving NaN return NaN + I*NaN.
+                 */
+                return resT{q_nan, q_nan};
+            }
+            else if (std::isnan(y)) {
+                /* atanh(+-Inf + I*NaN) = +-0 + I*NaN */
+                if (std::isinf(x)) {
+                    const realT res_re = sycl::copysign(realT(0), x);
+                    return resT{res_re, q_nan};
+                }
+                /* atanh(+-0 + I*NaN) = +-0 + I*NaN */
+                if (x == realT(0)) {
+                    return resT{x, q_nan};
+                }
+                /*
+                 * All other cases involving NaN return NaN + I*NaN.
+                 */
+                return resT{q_nan, q_nan};
+            }
+
+            /*
+             * For large x or y including
+             * atanh(+-Inf + I*+-Inf) = 0 + I*+-PI/2
+             * The sign of PI/2 depends on the sign of imaginary part of the
+             * input.
+             */
+            const realT RECIP_EPSILON =
+                realT(1) / std::numeric_limits<realT>::epsilon();
+            if (sycl::fabs(x) > RECIP_EPSILON || sycl::fabs(y) > RECIP_EPSILON)
+            {
+                const realT pi_half = sycl::atan(realT(1)) * 2;
+
+                const realT res_re = realT(0);
+                const realT res_im = sycl::copysign(pi_half, y);
+                return resT{res_re, res_im};
+            }
+            /* ordinary cases */
+            return exprm_ns::atanh(exprm_ns::complex<realT>(in)); // atanh(in);
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            return sycl::atanh(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using AtanhContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           AtanhFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using AtanhStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, AtanhFunctor<argTy, resTy>>;
+
+template <typename T>
+struct AtanhOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct AtanhContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class atanh_contig_kernel;
+
+template <typename argTy>
+sycl::event atanh_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using AtanhHS = hyperparam_detail::AtanhContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = AtanhHS::vec_sz;
+    static constexpr std::uint8_t n_vec = AtanhHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, AtanhOutputType, AtanhContigFunctor, atanh_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct AtanhContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!AtanhOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = atanh_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct AtanhTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::atanh(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename AtanhOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class atanh_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    atanh_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, AtanhOutputType, AtanhStridedFunctor, atanh_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct AtanhStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!AtanhOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = atanh_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::atanh
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
new file mode 100644
index 000000000000..96da6b9627ab
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
@@ -0,0 +1,231 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of bitwise_invert(x)
+/// function that inverts bits of binary representation of the argument.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::kernels::bitwise_invert
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT, typename resT>
+struct BitwiseInvertFunctor
+{
+    static_assert(std::is_same_v<argT, resT>);
+    static_assert(std::is_integral_v<argT> || std::is_same_v<argT, bool>);
+
+    using is_constant = typename std::false_type;
+    // constexpr resT constant_value = resT{};
+    using supports_vec = typename std::negation<std::is_same<argT, bool>>;
+    using supports_sg_loadstore = typename std::true_type;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (std::is_same_v<argT, bool>) {
+            return !in;
+        }
+        else {
+            return ~in;
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
+    {
+        return ~in;
+    }
+};
+
+template <typename argT,
+          typename resT = argT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using BitwiseInvertContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           BitwiseInvertFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using BitwiseInvertStridedFunctor =
+    elementwise_common::UnaryStridedFunctor<argTy,
+                                            resTy,
+                                            IndexerT,
+                                            BitwiseInvertFunctor<argTy, resTy>>;
+
+template <typename argTy>
+struct BitwiseInvertOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<argTy, bool>,
+        td_ns::TypeMapResultEntry<argTy, std::uint8_t>,
+        td_ns::TypeMapResultEntry<argTy, std::uint16_t>,
+        td_ns::TypeMapResultEntry<argTy, std::uint32_t>,
+        td_ns::TypeMapResultEntry<argTy, std::uint64_t>,
+        td_ns::TypeMapResultEntry<argTy, std::int8_t>,
+        td_ns::TypeMapResultEntry<argTy, std::int16_t>,
+        td_ns::TypeMapResultEntry<argTy, std::int32_t>,
+        td_ns::TypeMapResultEntry<argTy, std::int64_t>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct BitwiseInvertContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class bitwise_invert_contig_kernel;
+
+template <typename argTy>
+sycl::event
+    bitwise_invert_contig_impl(sycl::queue &exec_q,
+                               std::size_t nelems,
+                               const char *arg_p,
+                               char *res_p,
+                               const std::vector<sycl::event> &depends = {})
+{
+    using BitwiseInvertHS =
+        hyperparam_detail::BitwiseInvertContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = BitwiseInvertHS::vec_sz;
+    static constexpr std::uint8_t n_vec = BitwiseInvertHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, BitwiseInvertOutputType, BitwiseInvertContigFunctor,
+        bitwise_invert_contig_kernel, vec_sz, n_vec>(exec_q, nelems, arg_p,
+                                                     res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct BitwiseInvertContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseInvertOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_invert_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct BitwiseInvertTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::logical_not(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename BitwiseInvertOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class bitwise_invert_strided_kernel;
+
+template <typename argTy>
+sycl::event bitwise_invert_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, BitwiseInvertOutputType, BitwiseInvertStridedFunctor,
+        bitwise_invert_strided_kernel>(exec_q, nelems, nd, shape_and_strides,
+                                       arg_p, arg_offset, res_p, res_offset,
+                                       depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct BitwiseInvertStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseInvertOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_invert_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::bitwise_invert
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp
new file mode 100644
index 000000000000..ae632061571f
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp
@@ -0,0 +1,77 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines an implementation of the complex absolute value.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <limits>
+
+#include "sycl_complex.hpp"
+
+namespace dpctl::tensor::kernels::detail
+{
+
+template <typename realT>
+realT cabs(std::complex<realT> const &z)
+{
+    // Special values for cabs( x + y * 1j):
+    //   * If x is either +infinity or -infinity and y is any value
+    //   (including NaN), the result is +infinity.
+    //   * If x is any value (including NaN) and y is either +infinity or
+    //   -infinity, the result is +infinity.
+    //   * If x is either +0 or -0, the result is equal to abs(y).
+    //   * If y is either +0 or -0, the result is equal to abs(x).
+    //   * If x is NaN and y is a finite number, the result is NaN.
+    //   * If x is a finite number and y is NaN, the result is NaN.
+    //   * If x is NaN and y is NaN, the result is NaN.
+
+    const realT x = std::real(z);
+    const realT y = std::imag(z);
+
+    static constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN();
+    static constexpr realT p_inf = std::numeric_limits<realT>::infinity();
+
+    const realT res =
+        std::isinf(x)
+            ? p_inf
+            : ((std::isinf(y)
+                    ? p_inf
+                    : ((std::isnan(x)
+                            ? q_nan
+                            : exprm_ns::abs(exprm_ns::complex<realT>(z))))));
+
+    return res;
+}
+
+} // namespace dpctl::tensor::kernels::detail
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
new file mode 100644
index 000000000000..08fd4da2fb50
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
@@ -0,0 +1,230 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of CEIL(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::ceil
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct CeilFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (std::is_integral_v<argT>) {
+            return in;
+        }
+        else {
+            if (in == 0) {
+                return in;
+            }
+            return sycl::ceil(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using CeilContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           CeilFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using CeilStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, CeilFunctor<argTy, resTy>>;
+
+template <typename T>
+struct CeilOutputType
+{
+    using value_type =
+        typename std::disjunction<td_ns::TypeMapResultEntry<T, bool>,
+                                  td_ns::TypeMapResultEntry<T, std::uint8_t>,
+                                  td_ns::TypeMapResultEntry<T, std::uint16_t>,
+                                  td_ns::TypeMapResultEntry<T, std::uint32_t>,
+                                  td_ns::TypeMapResultEntry<T, std::uint64_t>,
+                                  td_ns::TypeMapResultEntry<T, std::int8_t>,
+                                  td_ns::TypeMapResultEntry<T, std::int16_t>,
+                                  td_ns::TypeMapResultEntry<T, std::int32_t>,
+                                  td_ns::TypeMapResultEntry<T, std::int64_t>,
+                                  td_ns::TypeMapResultEntry<T, sycl::half>,
+                                  td_ns::TypeMapResultEntry<T, float>,
+                                  td_ns::TypeMapResultEntry<T, double>,
+                                  td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct CeilContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class ceil_contig_kernel;
+
+template <typename argTy>
+sycl::event ceil_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using CeilHS = hyperparam_detail::CeilContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = CeilHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = CeilHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, CeilOutputType, CeilContigFunctor, ceil_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct CeilContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!CeilOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = ceil_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct CeilTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::ceil(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CeilOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class ceil_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    ceil_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, CeilOutputType, CeilStridedFunctor, ceil_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct CeilStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!CeilOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = ceil_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::ceil
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index e83426df8aa9..1c072dc58fdc 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -35,6 +35,7 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
+#include <iterator>
 #include <type_traits>
 #include <vector>
 
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
new file mode 100644
index 000000000000..2c965b236c87
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
@@ -0,0 +1,234 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of CONJ(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::conj
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct ConjFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using rT = typename argT::value_type;
+
+            return exprm_ns::conj(exprm_ns::complex<rT>(in)); // conj(in);
+        }
+        else {
+            if constexpr (!std::is_same_v<argT, bool>)
+                static_assert(std::is_same_v<resT, argT>);
+            return in;
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using ConjContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           ConjFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using ConjStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, ConjFunctor<argTy, resTy>>;
+
+template <typename T>
+struct ConjOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, bool, std::int8_t>,
+        td_ns::TypeMapResultEntry<T, std::uint8_t>,
+        td_ns::TypeMapResultEntry<T, std::uint16_t>,
+        td_ns::TypeMapResultEntry<T, std::uint32_t>,
+        td_ns::TypeMapResultEntry<T, std::uint64_t>,
+        td_ns::TypeMapResultEntry<T, std::int8_t>,
+        td_ns::TypeMapResultEntry<T, std::int16_t>,
+        td_ns::TypeMapResultEntry<T, std::int32_t>,
+        td_ns::TypeMapResultEntry<T, std::int64_t>,
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct ConjContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class conj_contig_kernel;
+
+template <typename argTy>
+sycl::event conj_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using ConjHS = hyperparam_detail::ConjContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = ConjHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = ConjHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, ConjOutputType, ConjContigFunctor, conj_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct ConjContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!ConjOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = conj_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct ConjTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::conj(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename ConjOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class conj_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    conj_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, ConjOutputType, ConjStridedFunctor, conj_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct ConjStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!ConjOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = conj_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::conj
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/abs.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/abs.cpp
new file mode 100644
index 000000000000..067a201099de
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/abs.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "abs.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/abs.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U01: ==== ABS   (x)
+namespace impl
+{
+
+namespace abs_fn_ns = dpctl::tensor::kernels::abs;
+
+static unary_contig_impl_fn_ptr_t abs_contig_dispatch_vector[td_ns::num_types];
+static int abs_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    abs_strided_dispatch_vector[td_ns::num_types];
+
+void populate_abs_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = abs_fn_ns;
+
+    using fn_ns::AbsContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AbsContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(abs_contig_dispatch_vector);
+
+    using fn_ns::AbsStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AbsStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(abs_strided_dispatch_vector);
+
+    using fn_ns::AbsTypeMapFactory;
+    DispatchVectorBuilder<int, AbsTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(abs_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_abs(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_abs_dispatch_vectors();
+        using impl::abs_contig_dispatch_vector;
+        using impl::abs_output_typeid_vector;
+        using impl::abs_strided_dispatch_vector;
+
+        auto abs_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, abs_output_typeid_vector,
+                abs_contig_dispatch_vector, abs_strided_dispatch_vector);
+        };
+        m.def("_abs", abs_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto abs_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, abs_output_typeid_vector);
+        };
+        m.def("_abs_result_type", abs_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/abs.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/abs.hpp
new file mode 100644
index 000000000000..b496f1e694ac
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/abs.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_abs(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/acos.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/acos.cpp
new file mode 100644
index 000000000000..52d962cd828e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/acos.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "acos.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/acos.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U02: ==== ACOS   (x)
+namespace impl
+{
+
+namespace acos_fn_ns = dpctl::tensor::kernels::acos;
+
+static unary_contig_impl_fn_ptr_t acos_contig_dispatch_vector[td_ns::num_types];
+static int acos_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    acos_strided_dispatch_vector[td_ns::num_types];
+
+void populate_acos_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = acos_fn_ns;
+
+    using fn_ns::AcosContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AcosContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(acos_contig_dispatch_vector);
+
+    using fn_ns::AcosStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AcosStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(acos_strided_dispatch_vector);
+
+    using fn_ns::AcosTypeMapFactory;
+    DispatchVectorBuilder<int, AcosTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(acos_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_acos(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_acos_dispatch_vectors();
+        using impl::acos_contig_dispatch_vector;
+        using impl::acos_output_typeid_vector;
+        using impl::acos_strided_dispatch_vector;
+
+        auto acos_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, acos_output_typeid_vector,
+                acos_contig_dispatch_vector, acos_strided_dispatch_vector);
+        };
+        m.def("_acos", acos_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto acos_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, acos_output_typeid_vector);
+        };
+        m.def("_acos_result_type", acos_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/acos.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/acos.hpp
new file mode 100644
index 000000000000..608b684c4e18
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/acos.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_acos(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/acosh.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/acosh.cpp
new file mode 100644
index 000000000000..c2334804e422
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/acosh.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "acosh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/acosh.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U03: ==== ACOSH   (x)
+namespace impl
+{
+
+namespace acosh_fn_ns = dpctl::tensor::kernels::acosh;
+
+static unary_contig_impl_fn_ptr_t
+    acosh_contig_dispatch_vector[td_ns::num_types];
+static int acosh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    acosh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_acosh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = acosh_fn_ns;
+
+    using fn_ns::AcoshContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AcoshContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(acosh_contig_dispatch_vector);
+
+    using fn_ns::AcoshStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AcoshStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(acosh_strided_dispatch_vector);
+
+    using fn_ns::AcoshTypeMapFactory;
+    DispatchVectorBuilder<int, AcoshTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(acosh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_acosh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_acosh_dispatch_vectors();
+        using impl::acosh_contig_dispatch_vector;
+        using impl::acosh_output_typeid_vector;
+        using impl::acosh_strided_dispatch_vector;
+
+        auto acosh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, acosh_output_typeid_vector,
+                acosh_contig_dispatch_vector, acosh_strided_dispatch_vector);
+        };
+        m.def("_acosh", acosh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto acosh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              acosh_output_typeid_vector);
+        };
+        m.def("_acosh_result_type", acosh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/acosh.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/acosh.hpp
new file mode 100644
index 000000000000..fc74fa99874f
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/acosh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_acosh(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/angle.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/angle.cpp
new file mode 100644
index 000000000000..df2b97fe7644
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/angle.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "angle.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/angle.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U43: ==== ANGLE   (x)
+namespace impl
+{
+
+namespace angle_fn_ns = dpctl::tensor::kernels::angle;
+
+static unary_contig_impl_fn_ptr_t
+    angle_contig_dispatch_vector[td_ns::num_types];
+static int angle_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    angle_strided_dispatch_vector[td_ns::num_types];
+
+void populate_angle_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = angle_fn_ns;
+
+    using fn_ns::AngleContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AngleContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(angle_contig_dispatch_vector);
+
+    using fn_ns::AngleStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AngleStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(angle_strided_dispatch_vector);
+
+    using fn_ns::AngleTypeMapFactory;
+    DispatchVectorBuilder<int, AngleTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(angle_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_angle(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_angle_dispatch_vectors();
+        using impl::angle_contig_dispatch_vector;
+        using impl::angle_output_typeid_vector;
+        using impl::angle_strided_dispatch_vector;
+
+        auto angle_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, angle_output_typeid_vector,
+                angle_contig_dispatch_vector, angle_strided_dispatch_vector);
+        };
+        m.def("_angle", angle_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto angle_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              angle_output_typeid_vector);
+        };
+        m.def("_angle_result_type", angle_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/angle.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/angle.hpp
new file mode 100644
index 000000000000..73071b945d7b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/angle.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_angle(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/asin.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/asin.cpp
new file mode 100644
index 000000000000..32d71c67527e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/asin.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "asin.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/asin.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U04: ==== ASIN   (x)
+namespace impl
+{
+
+namespace asin_fn_ns = dpctl::tensor::kernels::asin;
+
+static unary_contig_impl_fn_ptr_t asin_contig_dispatch_vector[td_ns::num_types];
+static int asin_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    asin_strided_dispatch_vector[td_ns::num_types];
+
+void populate_asin_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = asin_fn_ns;
+
+    using fn_ns::AsinContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AsinContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(asin_contig_dispatch_vector);
+
+    using fn_ns::AsinStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AsinStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(asin_strided_dispatch_vector);
+
+    using fn_ns::AsinTypeMapFactory;
+    DispatchVectorBuilder<int, AsinTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(asin_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_asin(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_asin_dispatch_vectors();
+        using impl::asin_contig_dispatch_vector;
+        using impl::asin_output_typeid_vector;
+        using impl::asin_strided_dispatch_vector;
+
+        auto asin_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, asin_output_typeid_vector,
+                asin_contig_dispatch_vector, asin_strided_dispatch_vector);
+        };
+        m.def("_asin", asin_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto asin_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, asin_output_typeid_vector);
+        };
+        m.def("_asin_result_type", asin_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/asin.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/asin.hpp
new file mode 100644
index 000000000000..39230000bdfc
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/asin.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_asin(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/asinh.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/asinh.cpp
new file mode 100644
index 000000000000..47f8a7dbf190
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/asinh.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "asinh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/asinh.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U05: ==== ASINH   (x)
+namespace impl
+{
+
+namespace asinh_fn_ns = dpctl::tensor::kernels::asinh;
+
+static unary_contig_impl_fn_ptr_t
+    asinh_contig_dispatch_vector[td_ns::num_types];
+static int asinh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    asinh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_asinh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = asinh_fn_ns;
+
+    using fn_ns::AsinhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AsinhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(asinh_contig_dispatch_vector);
+
+    using fn_ns::AsinhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AsinhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(asinh_strided_dispatch_vector);
+
+    using fn_ns::AsinhTypeMapFactory;
+    DispatchVectorBuilder<int, AsinhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(asinh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_asinh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_asinh_dispatch_vectors();
+        using impl::asinh_contig_dispatch_vector;
+        using impl::asinh_output_typeid_vector;
+        using impl::asinh_strided_dispatch_vector;
+
+        auto asinh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, asinh_output_typeid_vector,
+                asinh_contig_dispatch_vector, asinh_strided_dispatch_vector);
+        };
+        m.def("_asinh", asinh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto asinh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              asinh_output_typeid_vector);
+        };
+        m.def("_asinh_result_type", asinh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/asinh.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/asinh.hpp
new file mode 100644
index 000000000000..0d761f082ae3
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/asinh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_asinh(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan.cpp
new file mode 100644
index 000000000000..74ee82edbbc9
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "atan.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/atan.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U06: ==== ATAN   (x)
+namespace impl
+{
+
+namespace atan_fn_ns = dpctl::tensor::kernels::atan;
+
+static unary_contig_impl_fn_ptr_t atan_contig_dispatch_vector[td_ns::num_types];
+static int atan_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    atan_strided_dispatch_vector[td_ns::num_types];
+
+void populate_atan_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = atan_fn_ns;
+
+    using fn_ns::AtanContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AtanContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(atan_contig_dispatch_vector);
+
+    using fn_ns::AtanStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AtanStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(atan_strided_dispatch_vector);
+
+    using fn_ns::AtanTypeMapFactory;
+    DispatchVectorBuilder<int, AtanTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(atan_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_atan(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_atan_dispatch_vectors();
+        using impl::atan_contig_dispatch_vector;
+        using impl::atan_output_typeid_vector;
+        using impl::atan_strided_dispatch_vector;
+
+        auto atan_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, atan_output_typeid_vector,
+                atan_contig_dispatch_vector, atan_strided_dispatch_vector);
+        };
+        m.def("_atan", atan_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto atan_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, atan_output_typeid_vector);
+        };
+        m.def("_atan_result_type", atan_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan.hpp
new file mode 100644
index 000000000000..c4eb3f3baf92
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_atan(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/atanh.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/atanh.cpp
new file mode 100644
index 000000000000..2857f9ab8c10
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/atanh.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "atanh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/atanh.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U07: ==== ATANH   (x)
+namespace impl
+{
+
+namespace atanh_fn_ns = dpctl::tensor::kernels::atanh;
+
+static unary_contig_impl_fn_ptr_t
+    atanh_contig_dispatch_vector[td_ns::num_types];
+static int atanh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    atanh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_atanh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = atanh_fn_ns;
+
+    using fn_ns::AtanhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AtanhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(atanh_contig_dispatch_vector);
+
+    using fn_ns::AtanhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AtanhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(atanh_strided_dispatch_vector);
+
+    using fn_ns::AtanhTypeMapFactory;
+    DispatchVectorBuilder<int, AtanhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(atanh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_atanh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_atanh_dispatch_vectors();
+        using impl::atanh_contig_dispatch_vector;
+        using impl::atanh_output_typeid_vector;
+        using impl::atanh_strided_dispatch_vector;
+
+        auto atanh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, atanh_output_typeid_vector,
+                atanh_contig_dispatch_vector, atanh_strided_dispatch_vector);
+        };
+        m.def("_atanh", atanh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto atanh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              atanh_output_typeid_vector);
+        };
+        m.def("_atanh_result_type", atanh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/atanh.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/atanh.hpp
new file mode 100644
index 000000000000..5604e48deef6
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/atanh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_atanh(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp
new file mode 100644
index 000000000000..05e7f4eeb61b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp
@@ -0,0 +1,129 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "bitwise_invert.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/bitwise_invert.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U08: ===== BITWISE_INVERT        (x)
+namespace impl
+{
+
+namespace bitwise_invert_fn_ns = dpctl::tensor::kernels::bitwise_invert;
+
+static unary_contig_impl_fn_ptr_t
+    bitwise_invert_contig_dispatch_vector[td_ns::num_types];
+static int bitwise_invert_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    bitwise_invert_strided_dispatch_vector[td_ns::num_types];
+
+void populate_bitwise_invert_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_invert_fn_ns;
+
+    using fn_ns::BitwiseInvertContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t,
+                          BitwiseInvertContigFactory, num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(bitwise_invert_contig_dispatch_vector);
+
+    using fn_ns::BitwiseInvertStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t,
+                          BitwiseInvertStridedFactory, num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(bitwise_invert_strided_dispatch_vector);
+
+    using fn_ns::BitwiseInvertTypeMapFactory;
+    DispatchVectorBuilder<int, BitwiseInvertTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(bitwise_invert_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_bitwise_invert(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_invert_dispatch_vectors();
+        using impl::bitwise_invert_contig_dispatch_vector;
+        using impl::bitwise_invert_output_typeid_vector;
+        using impl::bitwise_invert_strided_dispatch_vector;
+
+        auto bitwise_invert_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                        sycl::queue &exec_q,
+                                        const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  bitwise_invert_output_typeid_vector,
+                                  bitwise_invert_contig_dispatch_vector,
+                                  bitwise_invert_strided_dispatch_vector);
+        };
+        m.def("_bitwise_invert", bitwise_invert_pyapi, "", py::arg("src"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        auto bitwise_invert_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(
+                dtype, bitwise_invert_output_typeid_vector);
+        };
+        m.def("_bitwise_invert_result_type", bitwise_invert_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp
new file mode 100644
index 000000000000..e20c0df3cf11
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_bitwise_invert(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/ceil.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/ceil.cpp
new file mode 100644
index 000000000000..4c4604e31692
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/ceil.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "ceil.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/ceil.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U09: ==== CEIL   (x)
+namespace impl
+{
+
+namespace ceil_fn_ns = dpctl::tensor::kernels::ceil;
+
+static unary_contig_impl_fn_ptr_t ceil_contig_dispatch_vector[td_ns::num_types];
+static int ceil_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    ceil_strided_dispatch_vector[td_ns::num_types];
+
+void populate_ceil_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = ceil_fn_ns;
+
+    using fn_ns::CeilContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CeilContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(ceil_contig_dispatch_vector);
+
+    using fn_ns::CeilStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CeilStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(ceil_strided_dispatch_vector);
+
+    using fn_ns::CeilTypeMapFactory;
+    DispatchVectorBuilder<int, CeilTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(ceil_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_ceil(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_ceil_dispatch_vectors();
+        using impl::ceil_contig_dispatch_vector;
+        using impl::ceil_output_typeid_vector;
+        using impl::ceil_strided_dispatch_vector;
+
+        auto ceil_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, ceil_output_typeid_vector,
+                ceil_contig_dispatch_vector, ceil_strided_dispatch_vector);
+        };
+        m.def("_ceil", ceil_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto ceil_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, ceil_output_typeid_vector);
+        };
+        m.def("_ceil_result_type", ceil_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/ceil.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/ceil.hpp
new file mode 100644
index 000000000000..436cb5f89b2b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/ceil.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_ceil(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/conj.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/conj.cpp
new file mode 100644
index 000000000000..cee977f719f4
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/conj.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "conj.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/conj.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U10: ==== CONJ   (x)
+namespace impl
+{
+
+namespace conj_fn_ns = dpctl::tensor::kernels::conj;
+
+static unary_contig_impl_fn_ptr_t conj_contig_dispatch_vector[td_ns::num_types];
+static int conj_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    conj_strided_dispatch_vector[td_ns::num_types];
+
+void populate_conj_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = conj_fn_ns;
+
+    using fn_ns::ConjContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ConjContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(conj_contig_dispatch_vector);
+
+    using fn_ns::ConjStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ConjStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(conj_strided_dispatch_vector);
+
+    using fn_ns::ConjTypeMapFactory;
+    DispatchVectorBuilder<int, ConjTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(conj_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_conj(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_conj_dispatch_vectors();
+        using impl::conj_contig_dispatch_vector;
+        using impl::conj_output_typeid_vector;
+        using impl::conj_strided_dispatch_vector;
+
+        auto conj_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, conj_output_typeid_vector,
+                conj_contig_dispatch_vector, conj_strided_dispatch_vector);
+        };
+        m.def("_conj", conj_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto conj_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, conj_output_typeid_vector);
+        };
+        m.def("_conj_result_type", conj_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/conj.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/conj.hpp
new file mode 100644
index 000000000000..4c0aeb17260b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/conj.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_conj(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
new file mode 100644
index 000000000000..c2a0f3762b01
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
@@ -0,0 +1,191 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "abs.hpp"
+#include "acos.hpp"
+#include "acosh.hpp"
+// #include "add.hpp"
+#include "angle.hpp"
+#include "asin.hpp"
+#include "asinh.hpp"
+#include "atan.hpp"
+// #include "atan2.hpp"
+#include "atanh.hpp"
+// #include "bitwise_and.hpp"
+#include "bitwise_invert.hpp"
+// #include "bitwise_left_shift.hpp"
+// #include "bitwise_or.hpp"
+// #include "bitwise_right_shift.hpp"
+// #include "bitwise_xor.hpp"
+// #include "cbrt.hpp"
+#include "ceil.hpp"
+#include "conj.hpp"
+// #include "copysign.hpp"
+// #include "cos.hpp"
+// #include "cosh.hpp"
+// #include "equal.hpp"
+// #include "exp.hpp"
+// #include "exp2.hpp"
+// #include "expm1.hpp"
+// #include "floor.hpp"
+// #include "floor_divide.hpp"
+// #include "greater.hpp"
+// #include "greater_equal.hpp"
+// #include "hypot.hpp"
+// #include "imag.hpp"
+// #include "isfinite.hpp"
+// #include "isinf.hpp"
+// #include "isnan.hpp"
+// #include "less.hpp"
+// #include "less_equal.hpp"
+// #include "log.hpp"
+// #include "log10.hpp"
+// #include "log1p.hpp"
+// #include "log2.hpp"
+// #include "logaddexp.hpp"
+// #include "logical_and.hpp"
+// #include "logical_not.hpp"
+// #include "logical_or.hpp"
+// #include "logical_xor.hpp"
+// #include "maximum.hpp"
+// #include "minimum.hpp"
+// #include "multiply.hpp"
+// #include "negative.hpp"
+// #include "nextafter.hpp"
+// #include "not_equal.hpp"
+// #include "positive.hpp"
+// #include "pow.hpp"
+// #include "proj.hpp"
+// #include "real.hpp"
+// #include "reciprocal.hpp"
+// #include "remainder.hpp"
+// #include "round.hpp"
+// #include "rsqrt.hpp"
+// #include "sign.hpp"
+// #include "signbit.hpp"
+// #include "sin.hpp"
+// #include "sinh.hpp"
+// #include "sqrt.hpp"
+// #include "square.hpp"
+// #include "subtract.hpp"
+// #include "tan.hpp"
+// #include "tanh.hpp"
+// #include "true_divide.hpp"
+// #include "trunc.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+
+/*! @brief Add elementwise functions to Python module */
+void init_elementwise_functions(py::module_ m)
+{
+    init_abs(m);
+    init_acos(m);
+    init_acosh(m);
+    // init_add(m);
+    init_angle(m);
+    init_asin(m);
+    init_asinh(m);
+    init_atan(m);
+    // init_atan2(m);
+    init_atanh(m);
+    // init_bitwise_and(m);
+    init_bitwise_invert(m);
+    // init_bitwise_left_shift(m);
+    // init_bitwise_or(m);
+    // init_bitwise_right_shift(m);
+    // init_bitwise_xor(m);
+    // init_cbrt(m);
+    init_ceil(m);
+    init_conj(m);
+    // init_copysign(m);
+    // init_cos(m);
+    // init_cosh(m);
+    // init_divide(m);
+    // init_equal(m);
+    // init_exp(m);
+    // init_exp2(m);
+    // init_expm1(m);
+    // init_floor(m);
+    // init_floor_divide(m);
+    // init_greater(m);
+    // init_greater_equal(m);
+    // init_hypot(m);
+    // init_imag(m);
+    // init_isfinite(m);
+    // init_isinf(m);
+    // init_isnan(m);
+    // init_less(m);
+    // init_less_equal(m);
+    // init_log(m);
+    // init_log10(m);
+    // init_log1p(m);
+    // init_log2(m);
+    // init_logaddexp(m);
+    // init_logical_and(m);
+    // init_logical_not(m);
+    // init_logical_or(m);
+    // init_logical_xor(m);
+    // init_maximum(m);
+    // init_minimum(m);
+    // init_multiply(m);
+    // init_nextafter(m);
+    // init_negative(m);
+    // init_not_equal(m);
+    // init_positive(m);
+    // init_pow(m);
+    // init_proj(m);
+    // init_real(m);
+    // init_reciprocal(m);
+    // init_remainder(m);
+    // init_round(m);
+    // init_rsqrt(m);
+    // init_sign(m);
+    // init_signbit(m);
+    // init_sin(m);
+    // init_sinh(m);
+    // init_sqrt(m);
+    // init_square(m);
+    // init_subtract(m);
+    // init_tan(m);
+    // init_tanh(m);
+    // init_trunc(m);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp
new file mode 100644
index 000000000000..0c385f2d15a5
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_elementwise_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
new file mode 100644
index 000000000000..cd56c5707264
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
@@ -0,0 +1,284 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "elementwise_functions_type_utils.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+static_assert(std::is_same_v<py::ssize_t, dpctl::tensor::ssize_t>);
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+/*! @brief Template implementing Python API for unary elementwise functions */
+template <typename output_typesT,
+          typename contig_dispatchT,
+          typename strided_dispatchT>
+std::pair<sycl::event, sycl::event>
+    py_unary_ufunc(const dpctl::tensor::usm_ndarray &src,
+                   const dpctl::tensor::usm_ndarray &dst,
+                   sycl::queue &q,
+                   const std::vector<sycl::event> &depends,
+                   //
+                   const output_typesT &output_type_vec,
+                   const contig_dispatchT &contig_dispatch_vector,
+                   const strided_dispatchT &strided_dispatch_vector)
+{
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    int func_output_typeid = output_type_vec[src_typeid];
+
+    // check that types are supported
+    if (dst_typeid != func_output_typeid) {
+        throw py::value_error(
+            "Destination array has unexpected elemental data type.");
+    }
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // check that dimensions are the same
+    int src_nd = src.get_ndim();
+    if (src_nd != dst.get_ndim()) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    // check that shapes are the same
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+
+    for (int i = 0; i < src_nd; ++i) {
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    // if nelems is zero, return
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    // check memory overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if (overlap(src, dst) && !same_logical_tensors(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    // handle contiguous inputs
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_dst_f_contig = dst.is_f_contiguous();
+
+    bool both_c_contig = (is_src_c_contig && is_dst_c_contig);
+    bool both_f_contig = (is_src_f_contig && is_dst_f_contig);
+
+    if (both_c_contig || both_f_contig) {
+        auto contig_fn = contig_dispatch_vector[src_typeid];
+
+        if (contig_fn == nullptr) {
+            throw std::runtime_error(
+                "Contiguous implementation is missing for src_typeid=" +
+                std::to_string(src_typeid));
+        }
+
+        auto comp_ev = contig_fn(q, src_nelems, src_data, dst_data, depends);
+        sycl::event ht_ev =
+            dpctl::utils::keep_args_alive(q, {src, dst}, {comp_ev});
+
+        return std::make_pair(ht_ev, comp_ev);
+    }
+
+    // simplify iteration space
+    //     if 1d with strides 1 - input is contig
+    //     dispatch to strided
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_nd;
+    const py::ssize_t *shape = src_shape;
+
+    simplify_iteration_space(nd, shape, src_strides, dst_strides,
+                             // output
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
+
+    if (nd == 1 && simplified_src_strides[0] == 1 &&
+        simplified_dst_strides[0] == 1) {
+        // Special case of contiguous data
+        auto contig_fn = contig_dispatch_vector[src_typeid];
+
+        if (contig_fn == nullptr) {
+            throw std::runtime_error(
+                "Contiguous implementation is missing for src_typeid=" +
+                std::to_string(src_typeid));
+        }
+
+        int src_elem_size = src.get_elemsize();
+        int dst_elem_size = dst.get_elemsize();
+        auto comp_ev =
+            contig_fn(q, src_nelems, src_data + src_elem_size * src_offset,
+                      dst_data + dst_elem_size * dst_offset, depends);
+
+        sycl::event ht_ev =
+            dpctl::utils::keep_args_alive(q, {src, dst}, {comp_ev});
+
+        return std::make_pair(ht_ev, comp_ev);
+    }
+
+    // Strided implementation
+    auto strided_fn = strided_dispatch_vector[src_typeid];
+
+    if (strided_fn == nullptr) {
+        throw std::runtime_error(
+            "Strided implementation is missing for src_typeid=" +
+            std::to_string(src_typeid));
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+
+    std::vector<sycl::event> host_tasks{};
+    host_tasks.reserve(2);
+
+    auto ptr_size_event_triple_ = device_allocate_and_pack<py::ssize_t>(
+        q, host_tasks, simplified_shape, simplified_src_strides,
+        simplified_dst_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_triple_));
+    const auto &copy_shape_ev = std::get<2>(ptr_size_event_triple_);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    sycl::event strided_fn_ev =
+        strided_fn(q, src_nelems, nd, shape_strides, src_data, src_offset,
+                   dst_data, dst_offset, depends, {copy_shape_ev});
+
+    // async free of shape_strides temporary
+    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        q, {strided_fn_ev}, shape_strides_owner);
+
+    host_tasks.push_back(tmp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(q, {src, dst}, host_tasks),
+        strided_fn_ev);
+}
+
+/*! @brief Template implementing Python API for querying of type support by
+ *         unary elementwise functions */
+template <typename output_typesT>
+py::object py_unary_ufunc_result_type(const py::dtype &input_dtype,
+                                      const output_typesT &output_types)
+{
+    int tn = input_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int src_typeid = -1;
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    try {
+        src_typeid = array_types.typenum_to_lookup_id(tn);
+    } catch (const std::exception &e) {
+        throw py::value_error(e.what());
+    }
+
+    using type_utils::_result_typeid;
+    int dst_typeid = _result_typeid(src_typeid, output_types);
+
+    if (dst_typeid < 0) {
+        auto res = py::none();
+        return py::cast<py::object>(res);
+    }
+    else {
+        using type_utils::_dtype_from_typenum;
+
+        auto dst_typenum_t = static_cast<td_ns::typenum_t>(dst_typeid);
+        auto dt = _dtype_from_typenum(dst_typenum_t);
+
+        return py::cast<py::object>(dt);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
new file mode 100644
index 000000000000..7d327ada7349
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
@@ -0,0 +1,96 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions for looking of supported types in elementwise
+/// functions.
+//===---------------------------------------------------------------------===//
+
+#include <string>
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "elementwise_functions_type_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal::type_utils
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t)
+{
+    switch (dst_typenum_t) {
+    case td_ns::typenum_t::BOOL:
+        return py::dtype("?");
+    case td_ns::typenum_t::INT8:
+        return py::dtype("i1");
+    case td_ns::typenum_t::UINT8:
+        return py::dtype("u1");
+    case td_ns::typenum_t::INT16:
+        return py::dtype("i2");
+    case td_ns::typenum_t::UINT16:
+        return py::dtype("u2");
+    case td_ns::typenum_t::INT32:
+        return py::dtype("i4");
+    case td_ns::typenum_t::UINT32:
+        return py::dtype("u4");
+    case td_ns::typenum_t::INT64:
+        return py::dtype("i8");
+    case td_ns::typenum_t::UINT64:
+        return py::dtype("u8");
+    case td_ns::typenum_t::HALF:
+        return py::dtype("f2");
+    case td_ns::typenum_t::FLOAT:
+        return py::dtype("f4");
+    case td_ns::typenum_t::DOUBLE:
+        return py::dtype("f8");
+    case td_ns::typenum_t::CFLOAT:
+        return py::dtype("c8");
+    case td_ns::typenum_t::CDOUBLE:
+        return py::dtype("c16");
+    default:
+        throw py::value_error("Unrecognized dst_typeid");
+    }
+}
+
+int _result_typeid(int arg_typeid, const int *fn_output_id)
+{
+    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types) {
+        throw py::value_error("Input typeid " + std::to_string(arg_typeid) +
+                              " is outside of expected bounds.");
+    }
+
+    return fn_output_id[arg_typeid];
+}
+
+} // namespace dpctl::tensor::py_internal::type_utils
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
new file mode 100644
index 000000000000..d3324feb3470
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
@@ -0,0 +1,56 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions for looking of supported types in elementwise
+/// functions.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal::type_utils
+{
+
+/*! @brief Produce dtype from a type number */
+extern py::dtype _dtype_from_typenum(td_ns::typenum_t);
+
+/*! @brief Lookup typeid of the result from typeid of
+ *         argument and the mapping table */
+extern int _result_typeid(int, const int *);
+
+} // namespace dpctl::tensor::py_internal::type_utils
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_elementwise.cpp b/dpctl_ext/tensor/libtensor/source/tensor_elementwise.cpp
new file mode 100644
index 000000000000..76b9916ca9d3
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/tensor_elementwise.cpp
@@ -0,0 +1,45 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "elementwise_functions/elementwise_common.hpp"
+
+namespace py = pybind11;
+
+PYBIND11_MODULE(_tensor_elementwise_impl, m)
+{
+    dpctl::tensor::py_internal::init_elementwise_functions(m);
+}
diff --git a/dpnp/dpnp_iface_bitwise.py b/dpnp/dpnp_iface_bitwise.py
index 733fbc697241..edf68b2f6581 100644
--- a/dpnp/dpnp_iface_bitwise.py
+++ b/dpnp/dpnp_iface_bitwise.py
@@ -46,6 +46,9 @@
 import dpctl.tensor._tensor_elementwise_impl as ti
 import numpy
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._tensor_elementwise_impl as ti_ext
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
 from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc
 
@@ -514,8 +517,8 @@ def binary_repr(num, width=None):
 
 invert = DPNPUnaryFunc(
     "invert",
-    ti._bitwise_invert_result_type,
-    ti._bitwise_invert,
+    ti_ext._bitwise_invert_result_type,
+    ti_ext._bitwise_invert,
     _INVERT_DOCSTRING,
 )
 
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index cdcdd3af92e4..906f814604b0 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -54,6 +54,7 @@
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt
+import dpctl_ext.tensor._tensor_elementwise_impl as ti_ext
 import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
@@ -384,8 +385,8 @@ def _validate_interp_param(param, name, exec_q, usm_type, dtype=None):
 
 abs = DPNPUnaryFunc(
     "abs",
-    ti._abs_result_type,
-    ti._abs,
+    ti_ext._abs_result_type,
+    ti_ext._abs,
     _ABS_DOCSTRING,
     mkl_fn_to_call="_mkl_abs_to_call",
     mkl_impl_fn="_abs",
@@ -540,8 +541,8 @@ def _validate_interp_param(param, name, exec_q, usm_type, dtype=None):
 
 angle = DPNPAngle(
     "angle",
-    ti._angle_result_type,
-    ti._angle,
+    ti_ext._angle_result_type,
+    ti_ext._angle,
     _ANGLE_DOCSTRING,
     mkl_fn_to_call="_mkl_arg_to_call",
     mkl_impl_fn="_arg",
@@ -646,8 +647,8 @@ def around(x, /, decimals=0, out=None):
 
 ceil = DPNPUnaryFunc(
     "ceil",
-    ti._ceil_result_type,
-    ti._ceil,
+    ti_ext._ceil_result_type,
+    ti_ext._ceil,
     _CEIL_DOCSTRING,
     mkl_fn_to_call="_mkl_ceil_to_call",
     mkl_impl_fn="_ceil",
@@ -781,8 +782,8 @@ def clip(a, /, min=None, max=None, *, out=None, order="K", **kwargs):
 
 conj = DPNPUnaryFunc(
     "conj",
-    ti._conj_result_type,
-    ti._conj,
+    ti_ext._conj_result_type,
+    ti_ext._conj,
     _CONJ_DOCSTRING,
     mkl_fn_to_call="_mkl_conj_to_call",
     mkl_impl_fn="_conj",
diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py
index a17c7dfd9d9a..24004fbbeaf9 100644
--- a/dpnp/dpnp_iface_trigonometric.py
+++ b/dpnp/dpnp_iface_trigonometric.py
@@ -47,6 +47,7 @@
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt
+import dpctl_ext.tensor._tensor_elementwise_impl as ti_ext
 import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
@@ -138,8 +139,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 acos = DPNPUnaryFunc(
     "acos",
-    ti._acos_result_type,
-    ti._acos,
+    ti_ext._acos_result_type,
+    ti_ext._acos,
     _ACOS_DOCSTRING,
     mkl_fn_to_call="_mkl_acos_to_call",
     mkl_impl_fn="_acos",
@@ -224,8 +225,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 acosh = DPNPUnaryFunc(
     "acosh",
-    ti._acosh_result_type,
-    ti._acosh,
+    ti_ext._acosh_result_type,
+    ti_ext._acosh,
     _ACOSH_DOCSTRING,
     mkl_fn_to_call="_mkl_acosh_to_call",
     mkl_impl_fn="_acosh",
@@ -310,8 +311,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 asin = DPNPUnaryFunc(
     "asin",
-    ti._asin_result_type,
-    ti._asin,
+    ti_ext._asin_result_type,
+    ti_ext._asin,
     _ASIN_DOCSTRING,
     mkl_fn_to_call="_mkl_asin_to_call",
     mkl_impl_fn="_asin",
@@ -394,8 +395,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 asinh = DPNPUnaryFunc(
     "asinh",
-    ti._asinh_result_type,
-    ti._asinh,
+    ti_ext._asinh_result_type,
+    ti_ext._asinh,
     _ASINH_DOCSTRING,
     mkl_fn_to_call="_mkl_asinh_to_call",
     mkl_impl_fn="_asinh",
@@ -480,8 +481,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 atan = DPNPUnaryFunc(
     "atan",
-    ti._atan_result_type,
-    ti._atan,
+    ti_ext._atan_result_type,
+    ti_ext._atan,
     _ATAN_DOCSTRING,
     mkl_fn_to_call="_mkl_atan_to_call",
     mkl_impl_fn="_atan",
@@ -655,8 +656,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 atanh = DPNPUnaryFunc(
     "atanh",
-    ti._atanh_result_type,
-    ti._atanh,
+    ti_ext._atanh_result_type,
+    ti_ext._atanh,
     _ATANH_DOCSTRING,
     mkl_fn_to_call="_mkl_atanh_to_call",
     mkl_impl_fn="_atanh",

From ce5f54e98925dcd7807f4119c58ce07dbd0d162f Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 17 Mar 2026 19:23:05 +0100
Subject: [PATCH 14/43] Extend `_tensor_elementwise_impl` (unary) part 2
 (#2796)

This PR extends `_tensor_elementwise_impl` with part of the unary
functions: `cos, cosh, exp, expm1, floor, imag, isfinite, isinf, isnan,
log, log1p, log2, log10, logical_not, negative, positive`
---
 dpctl_ext/tensor/CMakeLists.txt               |  32 +-
 dpctl_ext/tensor/__init__.py                  |  32 ++
 dpctl_ext/tensor/_elementwise_funcs.py        | 458 ++++++++++++++++++
 .../kernels/elementwise_functions/cos.hpp     | 311 ++++++++++++
 .../kernels/elementwise_functions/cosh.hpp    | 301 ++++++++++++
 .../kernels/elementwise_functions/exp.hpp     | 269 ++++++++++
 .../kernels/elementwise_functions/expm1.hpp   | 282 +++++++++++
 .../kernels/elementwise_functions/floor.hpp   | 229 +++++++++
 .../kernels/elementwise_functions/imag.hpp    | 232 +++++++++
 .../elementwise_functions/isfinite.hpp        | 229 +++++++++
 .../kernels/elementwise_functions/isinf.hpp   | 224 +++++++++
 .../kernels/elementwise_functions/isnan.hpp   | 222 +++++++++
 .../kernels/elementwise_functions/log.hpp     | 222 +++++++++
 .../kernels/elementwise_functions/log10.hpp   | 240 +++++++++
 .../kernels/elementwise_functions/log1p.hpp   | 248 ++++++++++
 .../kernels/elementwise_functions/log2.hpp    | 241 +++++++++
 .../elementwise_functions/logaddexp.hpp       |   3 +-
 .../elementwise_functions/logical_not.hpp     | 199 ++++++++
 .../elementwise_functions/negative.hpp        | 222 +++++++++
 .../elementwise_functions/positive.hpp        | 238 +++++++++
 .../source/elementwise_functions/cos.cpp      | 125 +++++
 .../source/elementwise_functions/cos.hpp      |  46 ++
 .../source/elementwise_functions/cosh.cpp     | 125 +++++
 .../source/elementwise_functions/cosh.hpp     |  46 ++
 .../elementwise_common.cpp                    |  64 +--
 .../source/elementwise_functions/exp.cpp      | 125 +++++
 .../source/elementwise_functions/exp.hpp      |  46 ++
 .../source/elementwise_functions/expm1.cpp    | 127 +++++
 .../source/elementwise_functions/expm1.hpp    |  46 ++
 .../source/elementwise_functions/floor.cpp    | 127 +++++
 .../source/elementwise_functions/floor.hpp    |  46 ++
 .../source/elementwise_functions/imag.cpp     | 125 +++++
 .../source/elementwise_functions/imag.hpp     |  46 ++
 .../source/elementwise_functions/isfinite.cpp | 128 +++++
 .../source/elementwise_functions/isfinite.hpp |  46 ++
 .../source/elementwise_functions/isinf.cpp    | 127 +++++
 .../source/elementwise_functions/isinf.hpp    |  46 ++
 .../source/elementwise_functions/isnan.cpp    | 127 +++++
 .../source/elementwise_functions/isnan.hpp    |  46 ++
 .../source/elementwise_functions/log.cpp      | 125 +++++
 .../source/elementwise_functions/log.hpp      |  46 ++
 .../source/elementwise_functions/log10.cpp    | 127 +++++
 .../source/elementwise_functions/log10.hpp    |  46 ++
 .../source/elementwise_functions/log1p.cpp    | 127 +++++
 .../source/elementwise_functions/log1p.hpp    |  46 ++
 .../source/elementwise_functions/log2.cpp     | 125 +++++
 .../source/elementwise_functions/log2.hpp     |  46 ++
 .../elementwise_functions/logical_not.cpp     | 129 +++++
 .../elementwise_functions/logical_not.hpp     |  46 ++
 .../source/elementwise_functions/negative.cpp | 128 +++++
 .../source/elementwise_functions/negative.hpp |  46 ++
 .../source/elementwise_functions/positive.cpp | 128 +++++
 .../source/elementwise_functions/positive.hpp |  46 ++
 dpnp/dpnp_iface_logic.py                      |  19 +-
 dpnp/dpnp_iface_mathematical.py               |  16 +-
 dpnp/dpnp_iface_trigonometric.py              |  32 +-
 56 files changed, 7244 insertions(+), 82 deletions(-)
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/cos.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/cos.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/cosh.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/cosh.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/exp.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/exp.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/expm1.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/expm1.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/floor.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/floor.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/imag.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/imag.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/isfinite.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/isfinite.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/isinf.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/isinf.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/isnan.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/isnan.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/log.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/log.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/log10.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/log10.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/log1p.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/log1p.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/log2.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/log2.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_not.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_not.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/negative.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/negative.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/positive.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/positive.hpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index 1a9649b91f82..261204223ddd 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -92,39 +92,39 @@ set(_elementwise_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp2.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/expm1.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/expm1.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor_divide.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater_equal.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/hypot.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/imag.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isfinite.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isinf.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isnan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/imag.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isfinite.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isinf.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isnan.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less_equal.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log1p.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log1p.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/negative.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/negative.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/nextafter.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index be7ec6851b5b..ea18c2aab35e 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -65,6 +65,22 @@
     bitwise_invert,
     ceil,
     conj,
+    cos,
+    cosh,
+    exp,
+    expm1,
+    floor,
+    imag,
+    isfinite,
+    isinf,
+    isnan,
+    log,
+    log1p,
+    log2,
+    log10,
+    logical_not,
+    negative,
+    positive,
 )
 from ._indexing_functions import (
     extract,
@@ -142,6 +158,8 @@
     "concat",
     "conj",
     "copy",
+    "cos",
+    "cosh",
     "count_nonzero",
     "clip",
     "cumulative_logsumexp",
@@ -153,25 +171,39 @@
     "extract",
     "expand_dims",
     "eye",
+    "exp",
+    "expm1",
     "finfo",
     "flip",
+    "floor",
     "from_numpy",
     "full",
     "full_like",
     "iinfo",
+    "imag",
+    "isfinite",
+    "isinf",
     "isdtype",
     "isin",
+    "isnan",
     "linspace",
+    "log",
+    "logical_not",
     "logsumexp",
+    "log1p",
+    "log2",
+    "log10",
     "max",
     "meshgrid",
     "min",
     "moveaxis",
     "permute_dims",
+    "negative",
     "nonzero",
     "ones",
     "ones_like",
     "place",
+    "positive",
     "prod",
     "put",
     "put_along_axis",
diff --git a/dpctl_ext/tensor/_elementwise_funcs.py b/dpctl_ext/tensor/_elementwise_funcs.py
index 3a3c05915732..b57074ae9784 100644
--- a/dpctl_ext/tensor/_elementwise_funcs.py
+++ b/dpctl_ext/tensor/_elementwise_funcs.py
@@ -31,6 +31,9 @@
 import dpctl_ext.tensor._tensor_elementwise_impl as ti
 
 from ._elementwise_common import UnaryElementwiseFunc
+from ._type_utils import (
+    _acceptance_fn_negative,
+)
 
 # U01: ==== ABS    (x)
 _abs_docstring_ = r"""
@@ -324,6 +327,461 @@
 )
 del _conj_docstring
 
+# U11: ==== COS           (x)
+_cos_docstring = r"""
+cos(x, /, \*, out=None, order='K')
+
+Computes cosine for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise cosine. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+cos = UnaryElementwiseFunc("cos", ti._cos_result_type, ti._cos, _cos_docstring)
+del _cos_docstring
+
+# U12: ==== COSH          (x)
+_cosh_docstring = r"""
+cosh(x, /, \*, out=None, order='K')
+
+Computes hyperbolic cosine for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise hyperbolic cosine. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+cosh = UnaryElementwiseFunc(
+    "cosh", ti._cosh_result_type, ti._cosh, _cosh_docstring
+)
+del _cosh_docstring
+
+# U13: ==== EXP           (x)
+_exp_docstring = r"""
+exp(x, /, \*, out=None, order='K')
+
+Computes the exponential for each element `x_i` of input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise exponential of `x`.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+exp = UnaryElementwiseFunc("exp", ti._exp_result_type, ti._exp, _exp_docstring)
+del _exp_docstring
+
+# U14: ==== EXPM1         (x)
+_expm1_docstring = r"""
+expm1(x, /, \*, out=None, order='K')
+
+Computes the exponential minus 1 for each element `x_i` of input array `x`.
+
+This function calculates `exp(x) - 1.0` more accurately for small values of `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (usm_ndarray):
+        Output array to populate. Array must have the correct
+        shape and the expected data type.
+    order ("C","F","A","K", optional): memory layout of the new
+        output array, if parameter `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise `exp(x) - 1` results.
+        The data type of the returned array is determined by the Type
+        Promotion Rules.
+"""
+
+expm1 = UnaryElementwiseFunc(
+    "expm1", ti._expm1_result_type, ti._expm1, _expm1_docstring
+)
+del _expm1_docstring
+
+# U15: ==== FLOOR         (x)
+_floor_docstring = r"""
+floor(x, /, \*, out=None, order='K')
+
+Returns the floor for each element `x_i` for input array `x`.
+
+The floor of `x_i` is the largest integer `n`, such that `n <= x_i`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a boolean or real-valued data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise floor.
+"""
+
+floor = UnaryElementwiseFunc(
+    "floor", ti._floor_result_type, ti._floor, _floor_docstring
+)
+del _floor_docstring
+
+# U16: ==== IMAG        (x)
+_imag_docstring = r"""
+imag(x, /, \*, out=None, order='K')
+
+Computes imaginary part of each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise imaginary component of input.
+        If the input is a real-valued data type, the returned array has
+        the same data type. If the input is a complex floating-point
+        data type, the returned array has a floating-point data type
+        with the same floating-point precision as complex input.
+"""
+
+imag = UnaryElementwiseFunc(
+    "imag", ti._imag_result_type, ti._imag, _imag_docstring
+)
+del _imag_docstring
+
+# U17: ==== ISFINITE    (x)
+_isfinite_docstring_ = r"""
+isfinite(x, /, \*, out=None, order='K')
+
+Test if each element of input array is a finite number.
+
+Args:
+    x (usm_ndarray):
+        Input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array which is True where `x` is not positive infinity,
+        negative infinity, or NaN, False otherwise.
+        The data type of the returned array is `bool`.
+"""
+
+isfinite = UnaryElementwiseFunc(
+    "isfinite", ti._isfinite_result_type, ti._isfinite, _isfinite_docstring_
+)
+del _isfinite_docstring_
+
+# U18: ==== ISINF       (x)
+_isinf_docstring_ = r"""
+isinf(x, /, \*, out=None, order='K')
+
+Test if each element of input array is an infinity.
+
+Args:
+    x (usm_ndarray):
+        Input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array which is True where `x` is positive or negative infinity,
+        False otherwise. The data type of the returned array is `bool`.
+"""
+
+isinf = UnaryElementwiseFunc(
+    "isinf", ti._isinf_result_type, ti._isinf, _isinf_docstring_
+)
+del _isinf_docstring_
+
+# U19: ==== ISNAN       (x)
+_isnan_docstring_ = r"""
+isnan(x, /, \*, out=None, order='K')
+
+Test if each element of an input array is a NaN.
+
+Args:
+    x (usm_ndarray):
+        Input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array which is True where x is NaN, False otherwise.
+        The data type of the returned array is `bool`.
+"""
+
+isnan = UnaryElementwiseFunc(
+    "isnan", ti._isnan_result_type, ti._isnan, _isnan_docstring_
+)
+del _isnan_docstring_
+
+# U20: ==== LOG         (x)
+_log_docstring = r"""
+log(x, /, \*, out=None, order='K')
+
+Computes the natural logarithm for each element `x_i` of input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (usm_ndarray):
+        Output array to populate. Array must have the correct
+        shape and the expected data type.
+    order ("C","F","A","K", optional): memory layout of the new
+        output array, if parameter `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise natural logarithm values.
+        The data type of the returned array is determined by the Type
+        Promotion Rules.
+"""
+
+log = UnaryElementwiseFunc("log", ti._log_result_type, ti._log, _log_docstring)
+del _log_docstring
+
+# U21: ==== LOG1P       (x)
+_log1p_docstring = r"""
+log1p(x, /, \*, out=None, order='K')
+
+Computes the natural logarithm of (1 + `x`) for each element `x_i` of input
+array `x`.
+
+This function calculates `log(1 + x)` more accurately for small values of `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (usm_ndarray):
+        Output array to populate. Array must have the correct
+        shape and the expected data type.
+    order ("C","F","A","K", optional): memory layout of the new
+        output array, if parameter `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise `log(1 + x)` results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+log1p = UnaryElementwiseFunc(
+    "log1p", ti._log1p_result_type, ti._log1p, _log1p_docstring
+)
+del _log1p_docstring
+
+# U22: ==== LOG2        (x)
+_log2_docstring_ = r"""
+log2(x, /, \*, out=None, order='K')
+
+Computes the base-2 logarithm for each element `x_i` of input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise base-2 logarithm of `x`.
+        The data type of the returned array is determined by the
+        Type Promotion Rules.
+"""
+
+log2 = UnaryElementwiseFunc(
+    "log2", ti._log2_result_type, ti._log2, _log2_docstring_
+)
+del _log2_docstring_
+
+# U23: ==== LOG10       (x)
+_log10_docstring_ = r"""
+log10(x, /, \*, out=None, order='K')
+
+Computes the base-10 logarithm for each element `x_i` of input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: `"K"`.
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise base-10 logarithm of `x`.
+        The data type of the returned array is determined by the
+        Type Promotion Rules.
+"""
+
+log10 = UnaryElementwiseFunc(
+    "log10", ti._log10_result_type, ti._log10, _log10_docstring_
+)
+del _log10_docstring_
+
+# U24: ==== LOGICAL_NOT (x)
+_logical_not_docstring = r"""
+logical_not(x, /, \*, out=None, order='K')
+
+Computes the logical NOT for each element `x_i` of input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array. May have any data type.
+    out (usm_ndarray):
+        Output array to populate. Array must have the correct
+        shape and the expected data type.
+    order ("C","F","A","K", optional): memory layout of the new
+        output array, if parameter `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise logical NOT results.
+"""
+
+logical_not = UnaryElementwiseFunc(
+    "logical_not",
+    ti._logical_not_result_type,
+    ti._logical_not,
+    _logical_not_docstring,
+)
+del _logical_not_docstring
+
+# U25: ==== NEGATIVE    (x)
+_negative_docstring_ = r"""
+negative(x, /, \*, out=None, order='K')
+
+Computes the numerical negative for each element `x_i` of input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a numeric data type.
+    out (usm_ndarray):
+        Output array to populate. Array must have the correct
+        shape and the expected data type.
+    order ("C","F","A","K", optional): memory layout of the new
+        output array, if parameter `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the negative of `x`.
+"""
+
+negative = UnaryElementwiseFunc(
+    "negative",
+    ti._negative_result_type,
+    ti._negative,
+    _negative_docstring_,
+    acceptance_fn=_acceptance_fn_negative,
+)
+del _negative_docstring_
+
+# U26: ==== POSITIVE    (x)
+_positive_docstring_ = r"""
+positive(x, /, \*, out=None, order='K')
+
+Computes the numerical positive for each element `x_i` of input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a numeric data type.
+    out (usm_ndarray):
+        Output array to populate. Array must have the correct
+        shape and the expected data type.
+    order ("C","F","A","K", optional): memory layout of the new
+        output array, if parameter `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the positive of `x`.
+"""
+
+positive = UnaryElementwiseFunc(
+    "positive", ti._positive_result_type, ti._positive, _positive_docstring_
+)
+del _positive_docstring_
+
 # U43: ==== ANGLE        (x)
 _angle_docstring = r"""
 angle(x, /, \*, out=None, order='K')
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
new file mode 100644
index 000000000000..7bd47d54778b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
@@ -0,0 +1,311 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of COS(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::cos
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct CosFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+
+            realT const &in_re = std::real(in);
+            realT const &in_im = std::imag(in);
+
+            const bool in_re_finite = std::isfinite(in_re);
+            const bool in_im_finite = std::isfinite(in_im);
+
+            /*
+             * Handle the nearly-non-exceptional cases where
+             * real and imaginary parts of input are finite.
+             */
+            if (in_re_finite && in_im_finite) {
+                return exprm_ns::cos(exprm_ns::complex<realT>(in)); // cos(in);
+            }
+
+            /*
+             * since cos(in) = cosh(I * in), for special cases,
+             * we return cosh(I * in).
+             */
+            const realT x = -in_im;
+            const realT y = in_re;
+
+            const bool xfinite = in_im_finite;
+            const bool yfinite = in_re_finite;
+            /*
+             * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0.
+             * The sign of 0 in the result is unspecified.  Choice = normally
+             * the same as dNaN.
+             *
+             * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0.
+             * The sign of 0 in the result is unspecified.  Choice = normally
+             * the same as d(NaN).
+             */
+            if (x == realT(0) && !yfinite) {
+                const realT y_m_y = (y - y);
+                const realT res_im = sycl::copysign(realT(0), x * y_m_y);
+                return resT{y_m_y, res_im};
+            }
+
+            /*
+             * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0.
+             *
+             * cosh(NaN +- I 0)   = d(NaN) + I sign(d(NaN, +-0))0.
+             * The sign of 0 in the result is unspecified.
+             */
+            if (y == realT(0) && !xfinite) {
+                const realT res_im = sycl::copysign(realT(0), x) * y;
+                return resT{x * x, res_im};
+            }
+
+            /*
+             * cosh(x +- I Inf) = dNaN + I dNaN.
+             *
+             * cosh(x + I NaN) = d(NaN) + I d(NaN).
+             */
+            if (xfinite && !yfinite) {
+                const realT y_m_y = (y - y);
+                return resT{y_m_y, x * y_m_y};
+            }
+
+            /*
+             * cosh(+-Inf + I NaN)  = +Inf + I d(NaN).
+             *
+             * cosh(+-Inf +- I Inf) = +Inf + I dNaN.
+             * The sign of Inf in the result is unspecified.  Choice = always +.
+             *
+             * cosh(+-Inf + I y)   = +Inf cos(y) +- I Inf sin(y)
+             */
+            if (std::isinf(x)) {
+                if (!yfinite) {
+                    return resT{x * x, sycl::copysign(q_nan, x)};
+                }
+                return resT{(x * x) * sycl::cos(y), x * sycl::sin(y)};
+            }
+
+            /*
+             * cosh(NaN + I NaN)  = d(NaN) + I d(NaN).
+             *
+             * cosh(NaN +- I Inf) = d(NaN) + I d(NaN).
+             *
+             * cosh(NaN + I y)    = d(NaN) + I d(NaN).
+             */
+            return resT{(x * x) * q_nan, (x + x) * q_nan};
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            return sycl::cos(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using CosContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           CosFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using CosStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, CosFunctor<argTy, resTy>>;
+
+template <typename T>
+struct CosOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
+        td_ns::
+            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct CosContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class cos_contig_kernel;
+
+template <typename argTy>
+sycl::event cos_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg_p,
+                            char *res_p,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using CosHS = hyperparam_detail::CosContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = CosHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = CosHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, CosOutputType, CosContigFunctor, cos_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct CosContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!CosOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = cos_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct CosTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::cos(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CosOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class cos_strided_kernel;
+
+template <typename argTy>
+sycl::event cos_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             ssize_t arg_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, CosOutputType, CosStridedFunctor, cos_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct CosStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!CosOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = cos_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::cos
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
new file mode 100644
index 000000000000..505eb5fffc29
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
@@ -0,0 +1,301 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of COSH(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::cosh
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct CoshFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+
+            const realT x = std::real(in);
+            const realT y = std::imag(in);
+
+            const bool xfinite = std::isfinite(x);
+            const bool yfinite = std::isfinite(y);
+
+            /*
+             * Handle the nearly-non-exceptional cases where
+             * real and imaginary parts of input are finite.
+             */
+            if (xfinite && yfinite) {
+                return exprm_ns::cosh(
+                    exprm_ns::complex<realT>(in)); // cosh(in);
+            }
+
+            /*
+             * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0.
+             * The sign of 0 in the result is unspecified.  Choice = normally
+             * the same as dNaN.
+             *
+             * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0.
+             * The sign of 0 in the result is unspecified.  Choice = normally
+             * the same as d(NaN).
+             */
+            if (x == realT(0) && !yfinite) {
+                const realT res_im = sycl::copysign(realT(0), x * q_nan);
+                return resT{q_nan, res_im};
+            }
+
+            /*
+             * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0.
+             *
+             * cosh(NaN +- I 0)   = d(NaN) + I sign(d(NaN, +-0))0.
+             * The sign of 0 in the result is unspecified.
+             */
+            if (y == realT(0) && !xfinite) {
+                const realT res_im = sycl::copysign(realT(0), x) * y;
+                return resT{x * x, res_im};
+            }
+
+            /*
+             * cosh(x +- I Inf) = dNaN + I dNaN.
+             *
+             * cosh(x + I NaN) = d(NaN) + I d(NaN).
+             */
+            if (xfinite && !yfinite) {
+                return resT{q_nan, x * q_nan};
+            }
+
+            /*
+             * cosh(+-Inf + I NaN)  = +Inf + I d(NaN).
+             *
+             * cosh(+-Inf +- I Inf) = +Inf + I dNaN.
+             * The sign of Inf in the result is unspecified.  Choice = always +.
+             *
+             * cosh(+-Inf + I y)   = +Inf cos(y) +- I Inf sin(y)
+             */
+            if (std::isinf(x)) {
+                if (!yfinite) {
+                    return resT{x * x, x * q_nan};
+                }
+                return resT{(x * x) * sycl::cos(y), x * sycl::sin(y)};
+            }
+
+            /*
+             * cosh(NaN + I NaN)  = d(NaN) + I d(NaN).
+             *
+             * cosh(NaN +- I Inf) = d(NaN) + I d(NaN).
+             *
+             * cosh(NaN + I y)    = d(NaN) + I d(NaN).
+             */
+            return resT{(x * x) * (y - y), (x + x) * (y - y)};
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            return sycl::cosh(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using CoshContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           CoshFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using CoshStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, CoshFunctor<argTy, resTy>>;
+
+template <typename T>
+struct CoshOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct CoshContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class cosh_contig_kernel;
+
+template <typename argTy>
+sycl::event cosh_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using CoshHS = hyperparam_detail::CoshContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = CoshHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = CoshHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, CoshOutputType, CoshContigFunctor, cosh_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct CoshContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!CoshOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = cosh_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct CoshTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::cosh(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CoshOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class cosh_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    cosh_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, CoshOutputType, CoshStridedFunctor, cosh_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct CoshStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!CoshOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = cosh_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::cosh
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
new file mode 100644
index 000000000000..97789e53bb5a
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
@@ -0,0 +1,269 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of EXP(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::exp
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct ExpFunctor
+{
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+
+            const realT x = std::real(in);
+            const realT y = std::imag(in);
+            if (std::isfinite(x)) {
+                if (std::isfinite(y)) {
+                    return exprm_ns::exp(
+                        exprm_ns::complex<realT>(in)); // exp(in);
+                }
+                else {
+                    return resT{q_nan, q_nan};
+                }
+            }
+            else if (std::isnan(x)) {
+                /* x is nan */
+                if (y == realT(0)) {
+                    return resT{in};
+                }
+                else {
+                    return resT{x, q_nan};
+                }
+            }
+            else {
+                if (!sycl::signbit(x)) { /* x is +inf */
+                    if (y == realT(0)) {
+                        return resT{x, y};
+                    }
+                    else if (std::isfinite(y)) {
+                        return resT{x * sycl::cos(y), x * sycl::sin(y)};
+                    }
+                    else {
+                        /* x = +inf, y = +-inf || nan */
+                        return resT{x, q_nan};
+                    }
+                }
+                else { /* x is -inf */
+                    if (std::isfinite(y)) {
+                        realT exp_x = sycl::exp(x);
+                        return resT{exp_x * sycl::cos(y), exp_x * sycl::sin(y)};
+                    }
+                    else {
+                        /* x = -inf, y = +-inf || nan */
+                        return resT{0, 0};
+                    }
+                }
+            }
+        }
+        else {
+            return sycl::exp(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using ExpContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           ExpFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using ExpStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, ExpFunctor<argTy, resTy>>;
+
+template <typename T>
+struct ExpOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct ExpContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class exp_contig_kernel;
+
+template <typename argTy>
+sycl::event exp_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg_p,
+                            char *res_p,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using ExpHS = hyperparam_detail::ExpContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = ExpHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = ExpHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, ExpOutputType, ExpContigFunctor, exp_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct ExpContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!ExpOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = exp_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct ExpTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::exp(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename ExpOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class exp_strided_kernel;
+
+template <typename argTy>
+sycl::event exp_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             ssize_t arg_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, ExpOutputType, ExpStridedFunctor, exp_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct ExpStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!ExpOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = exp_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::exp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
new file mode 100644
index 000000000000..c29030a6dc95
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
@@ -0,0 +1,282 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of EXPM1(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::expm1
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct Expm1Functor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+            // expm1(x + I*y) = expm1(x)*cos(y) - 2*sin(y / 2)^2 +
+            // I*exp(x)*sin(y)
+            const realT x = std::real(in);
+            const realT y = std::imag(in);
+
+            // special cases
+            if (std::isinf(x)) {
+                if (x > realT(0)) {
+                    // positive infinity cases
+                    if (!std::isfinite(y)) {
+                        return resT{x, std::numeric_limits<realT>::quiet_NaN()};
+                    }
+                    else if (y == realT(0)) {
+                        return in;
+                    }
+                    else {
+                        return (resT{sycl::copysign(x, sycl::cos(y)),
+                                     sycl::copysign(x, sycl::sin(y))});
+                    }
+                }
+                else {
+                    // negative infinity cases
+                    if (!std::isfinite(y)) {
+                        // copy sign of y to guarantee
+                        // conj(expm1(x)) == expm1(conj(x))
+                        return resT{realT(-1), sycl::copysign(realT(0), y)};
+                    }
+                    else {
+                        return resT{realT(-1),
+                                    sycl::copysign(realT(0), sycl::sin(y))};
+                    }
+                }
+            }
+
+            if (std::isnan(x)) {
+                if (y == realT(0)) {
+                    return in;
+                }
+                else {
+                    return resT{std::numeric_limits<realT>::quiet_NaN(),
+                                std::numeric_limits<realT>::quiet_NaN()};
+                }
+            }
+
+            // x, y finite numbers
+            const realT cosY_val = sycl::cos(y);
+            const realT sinY_val = (y == 0) ? y : sycl::sin(y);
+            const realT sinhalfY_val = (y == 0) ? y : sycl::sin(y / 2);
+
+            const realT res_re =
+                sycl::expm1(x) * cosY_val - 2 * sinhalfY_val * sinhalfY_val;
+            realT res_im = sycl::exp(x) * sinY_val;
+            return resT{res_re, res_im};
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            static_assert(std::is_same_v<argT, resT>);
+            if (in == 0) {
+                return in;
+            }
+            return sycl::expm1(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using Expm1ContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           Expm1Functor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using Expm1StridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, Expm1Functor<argTy, resTy>>;
+
+template <typename T>
+struct Expm1OutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
+        td_ns::
+            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct Expm1ContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class expm1_contig_kernel;
+
+template <typename argTy>
+sycl::event expm1_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using Expm1HS = hyperparam_detail::Expm1ContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = Expm1HS::vec_sz;
+    static constexpr std::uint8_t n_vecs = Expm1HS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, Expm1OutputType, Expm1ContigFunctor, expm1_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct Expm1ContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!Expm1OutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = expm1_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct Expm1TypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::expm1(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename Expm1OutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class expm1_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    expm1_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, Expm1OutputType, Expm1StridedFunctor, expm1_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct Expm1StridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!Expm1OutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = expm1_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::expm1
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
new file mode 100644
index 000000000000..375659b94a12
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
@@ -0,0 +1,229 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of FLOOR(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::floor
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct FloorFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (std::is_integral_v<argT>) {
+            return in;
+        }
+        else {
+            if (in == 0) {
+                return in;
+            }
+            return sycl::floor(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using FloorContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           FloorFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using FloorStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, FloorFunctor<argTy, resTy>>;
+
+template <typename T>
+struct FloorOutputType
+{
+    using value_type =
+        typename std::disjunction<td_ns::TypeMapResultEntry<T, bool>,
+                                  td_ns::TypeMapResultEntry<T, std::uint8_t>,
+                                  td_ns::TypeMapResultEntry<T, std::uint16_t>,
+                                  td_ns::TypeMapResultEntry<T, std::uint32_t>,
+                                  td_ns::TypeMapResultEntry<T, std::uint64_t>,
+                                  td_ns::TypeMapResultEntry<T, std::int8_t>,
+                                  td_ns::TypeMapResultEntry<T, std::int16_t>,
+                                  td_ns::TypeMapResultEntry<T, std::int32_t>,
+                                  td_ns::TypeMapResultEntry<T, std::int64_t>,
+                                  td_ns::TypeMapResultEntry<T, sycl::half>,
+                                  td_ns::TypeMapResultEntry<T, float>,
+                                  td_ns::TypeMapResultEntry<T, double>,
+                                  td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct FloorContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class floor_contig_kernel;
+
+template <typename argTy>
+sycl::event floor_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using FloorHS = hyperparam_detail::FloorContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = FloorHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = FloorHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, FloorOutputType, FloorContigFunctor, floor_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct FloorContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!FloorOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = floor_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct FloorTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::floor(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename FloorOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class floor_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    floor_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, FloorOutputType, FloorStridedFunctor, floor_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct FloorStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!FloorOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = floor_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::floor
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
new file mode 100644
index 000000000000..667fb47efdc8
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
@@ -0,0 +1,232 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of IMAG(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::imag
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+using dpctl::tensor::type_utils::is_complex_v;
+
+template <typename argT, typename resT>
+struct ImagFunctor
+{
+
+    // is function constant for given argT
+    using is_constant =
+        typename std::is_same<is_complex<argT>, std::false_type>;
+    // constant value, if constant
+    static constexpr resT constant_value = resT{0};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex_v<argT>) {
+            return std::imag(in);
+        }
+        else {
+            static_assert(std::is_same_v<resT, argT>);
+            return constant_value;
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using ImagContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           ImagFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using ImagStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, ImagFunctor<argTy, resTy>>;
+
+template <typename T>
+struct ImagOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, bool>,
+        td_ns::TypeMapResultEntry<T, std::uint8_t>,
+        td_ns::TypeMapResultEntry<T, std::uint16_t>,
+        td_ns::TypeMapResultEntry<T, std::uint32_t>,
+        td_ns::TypeMapResultEntry<T, std::uint64_t>,
+        td_ns::TypeMapResultEntry<T, std::int8_t>,
+        td_ns::TypeMapResultEntry<T, std::int16_t>,
+        td_ns::TypeMapResultEntry<T, std::int32_t>,
+        td_ns::TypeMapResultEntry<T, std::int64_t>,
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>, float>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct ImagContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class imag_contig_kernel;
+
+template <typename argTy>
+sycl::event imag_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using ImagHS = hyperparam_detail::ImagContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = ImagHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = ImagHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, ImagOutputType, ImagContigFunctor, imag_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct ImagContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!ImagOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = imag_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct ImagTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::imag(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename ImagOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class imag_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    imag_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, ImagOutputType, ImagStridedFunctor, imag_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct ImagStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!ImagOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = imag_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::imag
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
new file mode 100644
index 000000000000..5b8ee877981f
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
@@ -0,0 +1,229 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ISFINITE(x)
+/// function that tests whether a tensor element is finite.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::isfinite
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+using dpctl::tensor::type_utils::vec_cast;
+
+template <typename argT, typename resT>
+struct IsFiniteFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    /*
+    std::is_same<argT, bool>::value ||
+                           std::is_integral<argT>::value
+    */
+    using is_constant = typename std::disjunction<std::is_same<argT, bool>,
+                                                  std::is_integral<argT>>;
+    static constexpr resT constant_value = true;
+    using supports_vec = typename std::false_type;
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            const bool real_isfinite = std::isfinite(std::real(in));
+            const bool imag_isfinite = std::isfinite(std::imag(in));
+            return (real_isfinite && imag_isfinite);
+        }
+        else if constexpr (std::is_same<argT, bool>::value ||
+                           std::is_integral<argT>::value)
+        {
+            return constant_value;
+        }
+        else if constexpr (std::is_same_v<argT, sycl::half>) {
+            return sycl::isfinite(in);
+        }
+        else {
+            return std::isfinite(in);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
+    {
+        auto const &res_vec = sycl::isfinite(in);
+
+        using deducedT = typename std::remove_cv_t<
+            std::remove_reference_t<decltype(res_vec)>>::element_type;
+
+        return vec_cast<bool, deducedT, vec_sz>(res_vec);
+    }
+};
+
+template <typename argT,
+          typename resT = bool,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using IsFiniteContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           IsFiniteFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using IsFiniteStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, IsFiniteFunctor<argTy, resTy>>;
+
+template <typename argTy>
+struct IsFiniteOutputType
+{
+    using value_type = bool;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct IsFiniteContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class isfinite_contig_kernel;
+
+template <typename argTy>
+sycl::event isfinite_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg_p,
+                                 char *res_p,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using IsFiniteHS =
+        hyperparam_detail::IsFiniteContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = IsFiniteHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = IsFiniteHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, IsFiniteOutputType, IsFiniteContigFunctor,
+        isfinite_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                depends);
+}
+
+template <typename fnT, typename T>
+struct IsFiniteContigFactory
+{
+    fnT get()
+    {
+        fnT fn = isfinite_contig_impl<T>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct IsFiniteTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::isfinite(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename IsFiniteOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class isfinite_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    isfinite_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg_p,
+                          ssize_t arg_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<argTy, IsFiniteOutputType,
+                                                  IsFiniteStridedFunctor,
+                                                  isfinite_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct IsFiniteStridedFactory
+{
+    fnT get()
+    {
+        fnT fn = isfinite_strided_impl<T>;
+        return fn;
+    }
+};
+
+} // namespace dpctl::tensor::kernels::isfinite
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
new file mode 100644
index 000000000000..89ba83df9268
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
@@ -0,0 +1,224 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ISINF(x)
+/// function that tests whether a tensor element is an infinity.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::isinf
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+using dpctl::tensor::type_utils::vec_cast;
+
+template <typename argT, typename resT>
+struct IsInfFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using is_constant = typename std::disjunction<std::is_same<argT, bool>,
+                                                  std::is_integral<argT>>;
+    static constexpr resT constant_value = false;
+    using supports_vec =
+        typename std::disjunction<std::is_same<argT, sycl::half>,
+                                  std::is_floating_point<argT>>;
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            const bool real_isinf = std::isinf(std::real(in));
+            const bool imag_isinf = std::isinf(std::imag(in));
+            return (real_isinf || imag_isinf);
+        }
+        else if constexpr (std::is_same<argT, bool>::value ||
+                           std::is_integral<argT>::value)
+        {
+            return constant_value;
+        }
+        else if constexpr (std::is_same_v<argT, sycl::half>) {
+            return sycl::isinf(in);
+        }
+        else {
+            return std::isinf(in);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
+    {
+        auto const &res_vec = sycl::isinf(in);
+
+        using deducedT = typename std::remove_cv_t<
+            std::remove_reference_t<decltype(res_vec)>>::element_type;
+
+        return vec_cast<bool, deducedT, vec_sz>(res_vec);
+    }
+};
+
+template <typename argT,
+          typename resT = bool,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using IsInfContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           IsInfFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using IsInfStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, IsInfFunctor<argTy, resTy>>;
+
+template <typename argTy>
+struct IsInfOutputType
+{
+    using value_type = bool;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct IsInfContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class isinf_contig_kernel;
+
+template <typename argTy>
+sycl::event isinf_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using IsInfHS = hyperparam_detail::IsInfContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = IsInfHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = IsInfHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct IsInfContigFactory
+{
+    fnT get()
+    {
+        fnT fn = isinf_contig_impl<T>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct IsInfTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::isinf(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename IsInfOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class isinf_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    isinf_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, IsInfOutputType, IsInfStridedFunctor, isinf_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct IsInfStridedFactory
+{
+    fnT get()
+    {
+        fnT fn = isinf_strided_impl<T>;
+        return fn;
+    }
+};
+
+} // namespace dpctl::tensor::kernels::isinf
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
new file mode 100644
index 000000000000..f78b724bf2d3
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
@@ -0,0 +1,222 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ISNAN(x)
+/// function that tests whether a tensor element is a NaN.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::isnan
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+using dpctl::tensor::type_utils::vec_cast;
+
+template <typename argT, typename resT>
+struct IsNanFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    /*
+    std::is_same<argT, bool>::value ||
+                           std::is_integral<argT>::value
+    */
+    using is_constant = typename std::disjunction<std::is_same<argT, bool>,
+                                                  std::is_integral<argT>>;
+    static constexpr resT constant_value = false;
+    using supports_vec = typename std::true_type;
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            const bool real_isnan = sycl::isnan(std::real(in));
+            const bool imag_isnan = sycl::isnan(std::imag(in));
+            return (real_isnan || imag_isnan);
+        }
+        else if constexpr (std::is_same<argT, bool>::value ||
+                           std::is_integral<argT>::value)
+        {
+            return constant_value;
+        }
+        else {
+            return sycl::isnan(in);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
+    {
+        auto const &res_vec = sycl::isnan(in);
+
+        using deducedT = typename std::remove_cv_t<
+            std::remove_reference_t<decltype(res_vec)>>::element_type;
+
+        return vec_cast<bool, deducedT, vec_sz>(res_vec);
+    }
+};
+
+template <typename argT,
+          typename resT = bool,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using IsNanContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           IsNanFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using IsNanStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, IsNanFunctor<argTy, resTy>>;
+
+template <typename argTy>
+struct IsNanOutputType
+{
+    using value_type = bool;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct IsNanContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class isnan_contig_kernel;
+
+template <typename argTy>
+sycl::event isnan_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using IsNanHS = hyperparam_detail::IsNanContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = IsNanHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = IsNanHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct IsNanContigFactory
+{
+    fnT get()
+    {
+        fnT fn = isnan_contig_impl<T>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct IsNanTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::isnan(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename IsNanOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class isnan_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    isnan_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, IsNanOutputType, IsNanStridedFunctor, isnan_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct IsNanStridedFactory
+{
+    fnT get()
+    {
+        fnT fn = isnan_strided_impl<T>;
+        return fn;
+    }
+};
+
+} // namespace dpctl::tensor::kernels::isnan
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
new file mode 100644
index 000000000000..05e5048f65a7
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
@@ -0,0 +1,222 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of LOG(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::log
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct LogFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+            return exprm_ns::log(exprm_ns::complex<realT>(in)); // log(in);
+        }
+        else {
+            return sycl::log(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using LogContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           LogFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using LogStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, LogFunctor<argTy, resTy>>;
+
+template <typename T>
+struct LogOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
+        td_ns::
+            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct LogContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class log_contig_kernel;
+
+template <typename argTy>
+sycl::event log_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg_p,
+                            char *res_p,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using LogHS = hyperparam_detail::LogContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = LogHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LogHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, LogOutputType, LogContigFunctor, log_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct LogContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = log_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct LogTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::log(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename LogOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class log_strided_kernel;
+
+template <typename argTy>
+sycl::event log_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             ssize_t arg_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, LogOutputType, LogStridedFunctor, log_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct LogStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = log_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::log
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
new file mode 100644
index 000000000000..8ddb701ea622
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
@@ -0,0 +1,240 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of LOG10(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+#include <vector>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::log10
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+using dpctl::tensor::type_utils::vec_cast;
+
+template <typename argT, typename resT>
+struct Log10Functor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+            // return (log(in) / log(realT{10}));
+            return exprm_ns::log(exprm_ns::complex<realT>(in)) /
+                   sycl::log(realT{10});
+        }
+        else {
+            return sycl::log10(in);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
+    {
+        auto const &res_vec = sycl::log10(in);
+        using deducedT = typename std::remove_cv_t<
+            std::remove_reference_t<decltype(res_vec)>>::element_type;
+        if constexpr (std::is_same_v<resT, deducedT>) {
+            return res_vec;
+        }
+        else {
+            return vec_cast<resT, deducedT, vec_sz>(res_vec);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using Log10ContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           Log10Functor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using Log10StridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, Log10Functor<argTy, resTy>>;
+
+template <typename T>
+struct Log10OutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
+        td_ns::
+            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct Log10ContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class log10_contig_kernel;
+
+template <typename argTy>
+sycl::event log10_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using Log10HS = hyperparam_detail::Log10ContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = Log10HS::vec_sz;
+    static constexpr std::uint8_t n_vecs = Log10HS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, Log10OutputType, Log10ContigFunctor, log10_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct Log10ContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!Log10OutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = log10_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct Log10TypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::log10(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename Log10OutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class log10_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    log10_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, Log10OutputType, Log10StridedFunctor, log10_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct Log10StridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!Log10OutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = log10_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::log10
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
new file mode 100644
index 000000000000..8365932aead7
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
@@ -0,0 +1,248 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of LOG1P(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <algorithm>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::log1p
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+// TODO: evaluate precision against alternatives
+template <typename argT, typename resT>
+struct Log1pFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            // log1p(z) = ln((x + 1) + yI)
+            //          = ln(|(x + 1) + yi|)
+            //             + I * atan2(y, x + 1)
+            //          = ln(sqrt((x + 1)^2 + y^2))
+            //             + I *atan2(y, x + 1)
+            //          = log1p(x^2 + 2x + y^2) / 2
+            //             + I * atan2(y, x + 1)
+            using realT = typename argT::value_type;
+            const realT x = std::real(in);
+            const realT y = std::imag(in);
+
+            // imaginary part of result
+            const realT res_im = sycl::atan2(y, x + 1);
+
+            if (std::max(sycl::fabs(x), sycl::fabs(y)) < realT{.1}) {
+                const realT v = x * (2 + x) + y * y;
+                return resT{sycl::log1p(v) / 2, res_im};
+            }
+            else {
+                // when not close to zero,
+                // prevent overflow
+                const realT m = sycl::hypot(x + 1, y);
+                return resT{sycl::log(m), res_im};
+            }
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            return sycl::log1p(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using Log1pContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           Log1pFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using Log1pStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, Log1pFunctor<argTy, resTy>>;
+
+template <typename T>
+struct Log1pOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
+        td_ns::
+            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct Log1pContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class log1p_contig_kernel;
+
+template <typename argTy>
+sycl::event log1p_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using Log1pHS = hyperparam_detail::Log1pContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = Log1pHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = Log1pHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, Log1pOutputType, Log1pContigFunctor, log1p_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct Log1pContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!Log1pOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = log1p_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct Log1pTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::log1p(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename Log1pOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class log1p_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    log1p_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, Log1pOutputType, Log1pStridedFunctor, log1p_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct Log1pStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!Log1pOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = log1p_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::log1p
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
new file mode 100644
index 000000000000..3cb537b82522
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
@@ -0,0 +1,241 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of LOG2(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+#include <vector>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::log2
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+using dpctl::tensor::type_utils::vec_cast;
+
+template <typename argT, typename resT>
+struct Log2Functor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            // log(in) / log(realT{2});
+            return exprm_ns::log(exprm_ns::complex<realT>(in)) /
+                   sycl::log(realT{2});
+        }
+        else {
+            return sycl::log2(in);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
+    {
+        auto const &res_vec = sycl::log2(in);
+        using deducedT = typename std::remove_cv_t<
+            std::remove_reference_t<decltype(res_vec)>>::element_type;
+        if constexpr (std::is_same_v<resT, deducedT>) {
+            return res_vec;
+        }
+        else {
+            return vec_cast<resT, deducedT, vec_sz>(res_vec);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using Log2ContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           Log2Functor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using Log2StridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, Log2Functor<argTy, resTy>>;
+
+template <typename T>
+struct Log2OutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
+        td_ns::
+            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct Log2ContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class log2_contig_kernel;
+
+template <typename argTy>
+sycl::event log2_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using Log2HS = hyperparam_detail::Log2ContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = Log2HS::vec_sz;
+    static constexpr std::uint8_t n_vecs = Log2HS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, Log2OutputType, Log2ContigFunctor, log2_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct Log2ContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!Log2OutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = log2_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct Log2TypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::log2(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename Log2OutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class log2_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    log2_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, Log2OutputType, Log2StridedFunctor, log2_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct Log2StridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!Log2OutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = log2_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::log2
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index 7337b6e43eab..af93b089f0b2 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -25,7 +25,8 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 // THE POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
-///
+//
+//===---------------------------------------------------------------------===//
 /// \file
 /// This file defines kernels for elementwise evaluation of LOGADDEXP(x1, x2)
 /// function.
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
new file mode 100644
index 000000000000..b8f1c042ca73
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
@@ -0,0 +1,199 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of LOGICAL_NOT(x)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::logical_not
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT, typename resT>
+struct LogicalNotFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using is_constant = typename std::false_type;
+    // constexpr resT constant_value = resT{};
+    using supports_vec = typename std::false_type;
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<tu_ns::is_complex<resT>, tu_ns::is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        using tu_ns::convert_impl;
+        return !convert_impl<bool, argT>(in);
+    }
+};
+
+template <typename argT,
+          typename resT = bool,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using LogicalNotContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           LogicalNotFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using LogicalNotStridedFunctor =
+    elementwise_common::UnaryStridedFunctor<argTy,
+                                            resTy,
+                                            IndexerT,
+                                            LogicalNotFunctor<argTy, resTy>>;
+
+template <typename argTy>
+struct LogicalNotOutputType
+{
+    using value_type = bool;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct LogicalNotContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class logical_not_contig_kernel;
+
+template <typename argTy>
+sycl::event
+    logical_not_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg_p,
+                            char *res_p,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using LogicalNotHS =
+        hyperparam_detail::LogicalNotContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = LogicalNotHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LogicalNotHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, LogicalNotOutputType, LogicalNotContigFunctor,
+        logical_not_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                   depends);
+}
+
+template <typename fnT, typename T>
+struct LogicalNotContigFactory
+{
+    fnT get()
+    {
+        fnT fn = logical_not_contig_impl<T>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T>
+struct LogicalNotTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::logical_not(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename LogicalNotOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class logical_not_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    logical_not_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             ssize_t arg_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<argTy, LogicalNotOutputType,
+                                                  LogicalNotStridedFunctor,
+                                                  logical_not_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct LogicalNotStridedFactory
+{
+    fnT get()
+    {
+        fnT fn = logical_not_strided_impl<T>;
+        return fn;
+    }
+};
+
+} // namespace dpctl::tensor::kernels::logical_not
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
new file mode 100644
index 000000000000..f90786013557
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
@@ -0,0 +1,222 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of NEGATIVE(x)
+/// function that returns -x.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::negative
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct NegativeFunctor
+{
+
+    using is_constant = typename std::false_type;
+    // constexpr resT constant_value = resT{};
+    using supports_vec = typename std::false_type;
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &x) const
+    {
+        return -x;
+    }
+};
+
+template <typename argT,
+          typename resT = argT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using NegativeContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           NegativeFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename T>
+struct NegativeOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, std::uint8_t>,
+        td_ns::TypeMapResultEntry<T, std::uint16_t>,
+        td_ns::TypeMapResultEntry<T, std::uint32_t>,
+        td_ns::TypeMapResultEntry<T, std::uint64_t>,
+        td_ns::TypeMapResultEntry<T, std::int8_t>,
+        td_ns::TypeMapResultEntry<T, std::int16_t>,
+        td_ns::TypeMapResultEntry<T, std::int32_t>,
+        td_ns::TypeMapResultEntry<T, std::int64_t>,
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct NegativeContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class negative_contig_kernel;
+
+template <typename argTy>
+sycl::event negative_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg_p,
+                                 char *res_p,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using NegHS = hyperparam_detail::NegativeContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = NegHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = NegHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, NegativeOutputType, NegativeContigFunctor,
+        negative_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                depends);
+}
+
+template <typename fnT, typename T>
+struct NegativeContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!NegativeOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = negative_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct NegativeTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::negative(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename NegativeOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename argTy, typename resTy, typename IndexerT>
+using NegativeStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, NegativeFunctor<argTy, resTy>>;
+
+template <typename T1, typename T2, typename T3>
+class negative_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    negative_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg_p,
+                          ssize_t arg_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<argTy, NegativeOutputType,
+                                                  NegativeStridedFunctor,
+                                                  negative_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct NegativeStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!NegativeOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = negative_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::negative
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
new file mode 100644
index 000000000000..c1ef29c709ab
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
@@ -0,0 +1,238 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of POSITIVE(x)
+/// function that returns +x.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::positive
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+using dpctl::tensor::type_utils::vec_cast;
+
+template <typename argT, typename resT>
+struct PositiveFunctor
+{
+
+    using is_constant = typename std::false_type;
+    // constexpr resT constant_value = resT{};
+    using supports_vec = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &x) const
+    {
+        return x;
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
+    {
+        auto const &res_vec = in;
+        using deducedT = typename std::remove_cv_t<
+            std::remove_reference_t<decltype(res_vec)>>::element_type;
+        if constexpr (std::is_same_v<resT, deducedT>) {
+            return res_vec;
+        }
+        else {
+            return vec_cast<resT, deducedT, vec_sz>(res_vec);
+        }
+    }
+};
+
+template <typename argT,
+          typename resT = argT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using PositiveContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           PositiveFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename T>
+struct PositiveOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, std::uint8_t>,
+        td_ns::TypeMapResultEntry<T, std::uint16_t>,
+        td_ns::TypeMapResultEntry<T, std::uint32_t>,
+        td_ns::TypeMapResultEntry<T, std::uint64_t>,
+        td_ns::TypeMapResultEntry<T, std::int8_t>,
+        td_ns::TypeMapResultEntry<T, std::int16_t>,
+        td_ns::TypeMapResultEntry<T, std::int32_t>,
+        td_ns::TypeMapResultEntry<T, std::int64_t>,
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct PositiveContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class positive_contig_kernel;
+
+template <typename argTy>
+sycl::event positive_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg_p,
+                                 char *res_p,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using PosHS = hyperparam_detail::PositiveContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = PosHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = PosHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, PositiveOutputType, PositiveContigFunctor,
+        positive_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                depends);
+}
+
+template <typename fnT, typename T>
+struct PositiveContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!PositiveOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = positive_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct PositiveTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::positive(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename PositiveOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename argTy, typename resTy, typename IndexerT>
+using PositiveStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, PositiveFunctor<argTy, resTy>>;
+
+template <typename T1, typename T2, typename T3>
+class positive_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    positive_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg_p,
+                          ssize_t arg_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<argTy, PositiveOutputType,
+                                                  PositiveStridedFunctor,
+                                                  positive_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct PositiveStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!PositiveOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = positive_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::positive
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/cos.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/cos.cpp
new file mode 100644
index 000000000000..966364c8b8c0
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/cos.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "cos.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/cos.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U11: ==== COS   (x)
+namespace impl
+{
+
+namespace cos_fn_ns = dpctl::tensor::kernels::cos;
+
+static unary_contig_impl_fn_ptr_t cos_contig_dispatch_vector[td_ns::num_types];
+static int cos_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    cos_strided_dispatch_vector[td_ns::num_types];
+
+void populate_cos_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = cos_fn_ns;
+
+    using fn_ns::CosContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CosContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(cos_contig_dispatch_vector);
+
+    using fn_ns::CosStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CosStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(cos_strided_dispatch_vector);
+
+    using fn_ns::CosTypeMapFactory;
+    DispatchVectorBuilder<int, CosTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(cos_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_cos(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_cos_dispatch_vectors();
+        using impl::cos_contig_dispatch_vector;
+        using impl::cos_output_typeid_vector;
+        using impl::cos_strided_dispatch_vector;
+
+        auto cos_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, cos_output_typeid_vector,
+                cos_contig_dispatch_vector, cos_strided_dispatch_vector);
+        };
+        m.def("_cos", cos_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto cos_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, cos_output_typeid_vector);
+        };
+        m.def("_cos_result_type", cos_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/cos.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/cos.hpp
new file mode 100644
index 000000000000..4b9ab341a355
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/cos.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cos(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/cosh.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/cosh.cpp
new file mode 100644
index 000000000000..54fc5d57e4df
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/cosh.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "cosh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/cosh.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U12: ==== COSH   (x)
+namespace impl
+{
+
+namespace cosh_fn_ns = dpctl::tensor::kernels::cosh;
+
+static unary_contig_impl_fn_ptr_t cosh_contig_dispatch_vector[td_ns::num_types];
+static int cosh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    cosh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_cosh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = cosh_fn_ns;
+
+    using fn_ns::CoshContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CoshContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(cosh_contig_dispatch_vector);
+
+    using fn_ns::CoshStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CoshStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(cosh_strided_dispatch_vector);
+
+    using fn_ns::CoshTypeMapFactory;
+    DispatchVectorBuilder<int, CoshTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(cosh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_cosh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_cosh_dispatch_vectors();
+        using impl::cosh_contig_dispatch_vector;
+        using impl::cosh_output_typeid_vector;
+        using impl::cosh_strided_dispatch_vector;
+
+        auto cosh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, cosh_output_typeid_vector,
+                cosh_contig_dispatch_vector, cosh_strided_dispatch_vector);
+        };
+        m.def("_cosh", cosh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto cosh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, cosh_output_typeid_vector);
+        };
+        m.def("_cosh_result_type", cosh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/cosh.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/cosh.hpp
new file mode 100644
index 000000000000..6ddfe5643b54
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/cosh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cosh(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
index c2a0f3762b01..0a0c02f7ed31 100644
--- a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
@@ -55,39 +55,39 @@
 #include "ceil.hpp"
 #include "conj.hpp"
 // #include "copysign.hpp"
-// #include "cos.hpp"
-// #include "cosh.hpp"
+#include "cos.hpp"
+#include "cosh.hpp"
 // #include "equal.hpp"
-// #include "exp.hpp"
+#include "exp.hpp"
 // #include "exp2.hpp"
-// #include "expm1.hpp"
-// #include "floor.hpp"
+#include "expm1.hpp"
+#include "floor.hpp"
 // #include "floor_divide.hpp"
 // #include "greater.hpp"
 // #include "greater_equal.hpp"
 // #include "hypot.hpp"
-// #include "imag.hpp"
-// #include "isfinite.hpp"
-// #include "isinf.hpp"
-// #include "isnan.hpp"
+#include "imag.hpp"
+#include "isfinite.hpp"
+#include "isinf.hpp"
+#include "isnan.hpp"
 // #include "less.hpp"
 // #include "less_equal.hpp"
-// #include "log.hpp"
-// #include "log10.hpp"
-// #include "log1p.hpp"
-// #include "log2.hpp"
+#include "log.hpp"
+#include "log10.hpp"
+#include "log1p.hpp"
+#include "log2.hpp"
 // #include "logaddexp.hpp"
 // #include "logical_and.hpp"
-// #include "logical_not.hpp"
+#include "logical_not.hpp"
 // #include "logical_or.hpp"
 // #include "logical_xor.hpp"
 // #include "maximum.hpp"
 // #include "minimum.hpp"
 // #include "multiply.hpp"
-// #include "negative.hpp"
+#include "negative.hpp"
 // #include "nextafter.hpp"
 // #include "not_equal.hpp"
-// #include "positive.hpp"
+#include "positive.hpp"
 // #include "pow.hpp"
 // #include "proj.hpp"
 // #include "real.hpp"
@@ -135,40 +135,40 @@ void init_elementwise_functions(py::module_ m)
     init_ceil(m);
     init_conj(m);
     // init_copysign(m);
-    // init_cos(m);
-    // init_cosh(m);
+    init_cos(m);
+    init_cosh(m);
     // init_divide(m);
     // init_equal(m);
-    // init_exp(m);
+    init_exp(m);
     // init_exp2(m);
-    // init_expm1(m);
-    // init_floor(m);
+    init_expm1(m);
+    init_floor(m);
     // init_floor_divide(m);
     // init_greater(m);
     // init_greater_equal(m);
     // init_hypot(m);
-    // init_imag(m);
-    // init_isfinite(m);
-    // init_isinf(m);
-    // init_isnan(m);
+    init_imag(m);
+    init_isfinite(m);
+    init_isinf(m);
+    init_isnan(m);
     // init_less(m);
     // init_less_equal(m);
-    // init_log(m);
-    // init_log10(m);
-    // init_log1p(m);
-    // init_log2(m);
+    init_log(m);
+    init_log10(m);
+    init_log1p(m);
+    init_log2(m);
     // init_logaddexp(m);
     // init_logical_and(m);
-    // init_logical_not(m);
+    init_logical_not(m);
     // init_logical_or(m);
     // init_logical_xor(m);
     // init_maximum(m);
     // init_minimum(m);
     // init_multiply(m);
     // init_nextafter(m);
-    // init_negative(m);
+    init_negative(m);
     // init_not_equal(m);
-    // init_positive(m);
+    init_positive(m);
     // init_pow(m);
     // init_proj(m);
     // init_real(m);
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp.cpp
new file mode 100644
index 000000000000..cd3cd65107f7
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "exp.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/exp.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U13: ==== EXP   (x)
+namespace impl
+{
+
+namespace exp_fn_ns = dpctl::tensor::kernels::exp;
+
+static unary_contig_impl_fn_ptr_t exp_contig_dispatch_vector[td_ns::num_types];
+static int exp_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    exp_strided_dispatch_vector[td_ns::num_types];
+
+void populate_exp_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = exp_fn_ns;
+
+    using fn_ns::ExpContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ExpContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(exp_contig_dispatch_vector);
+
+    using fn_ns::ExpStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ExpStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(exp_strided_dispatch_vector);
+
+    using fn_ns::ExpTypeMapFactory;
+    DispatchVectorBuilder<int, ExpTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(exp_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_exp(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_exp_dispatch_vectors();
+        using impl::exp_contig_dispatch_vector;
+        using impl::exp_output_typeid_vector;
+        using impl::exp_strided_dispatch_vector;
+
+        auto exp_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, exp_output_typeid_vector,
+                exp_contig_dispatch_vector, exp_strided_dispatch_vector);
+        };
+        m.def("_exp", exp_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto exp_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, exp_output_typeid_vector);
+        };
+        m.def("_exp_result_type", exp_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp.hpp
new file mode 100644
index 000000000000..14b757a18e92
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_exp(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/expm1.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/expm1.cpp
new file mode 100644
index 000000000000..b4770b7b819c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/expm1.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "expm1.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/expm1.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U14: ==== EXPM1   (x)
+namespace impl
+{
+
+namespace expm1_fn_ns = dpctl::tensor::kernels::expm1;
+
+static unary_contig_impl_fn_ptr_t
+    expm1_contig_dispatch_vector[td_ns::num_types];
+static int expm1_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    expm1_strided_dispatch_vector[td_ns::num_types];
+
+void populate_expm1_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = expm1_fn_ns;
+
+    using fn_ns::Expm1ContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Expm1ContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(expm1_contig_dispatch_vector);
+
+    using fn_ns::Expm1StridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Expm1StridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(expm1_strided_dispatch_vector);
+
+    using fn_ns::Expm1TypeMapFactory;
+    DispatchVectorBuilder<int, Expm1TypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(expm1_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_expm1(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_expm1_dispatch_vectors();
+        using impl::expm1_contig_dispatch_vector;
+        using impl::expm1_output_typeid_vector;
+        using impl::expm1_strided_dispatch_vector;
+
+        auto expm1_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, expm1_output_typeid_vector,
+                expm1_contig_dispatch_vector, expm1_strided_dispatch_vector);
+        };
+        m.def("_expm1", expm1_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto expm1_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              expm1_output_typeid_vector);
+        };
+        m.def("_expm1_result_type", expm1_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/expm1.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/expm1.hpp
new file mode 100644
index 000000000000..4f373fe67dff
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/expm1.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_expm1(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor.cpp
new file mode 100644
index 000000000000..2a81ce6552a9
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "floor.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/floor.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U15: ==== FLOOR   (x)
+namespace impl
+{
+
+namespace floor_fn_ns = dpctl::tensor::kernels::floor;
+
+static unary_contig_impl_fn_ptr_t
+    floor_contig_dispatch_vector[td_ns::num_types];
+static int floor_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    floor_strided_dispatch_vector[td_ns::num_types];
+
+void populate_floor_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = floor_fn_ns;
+
+    using fn_ns::FloorContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, FloorContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(floor_contig_dispatch_vector);
+
+    using fn_ns::FloorStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, FloorStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(floor_strided_dispatch_vector);
+
+    using fn_ns::FloorTypeMapFactory;
+    DispatchVectorBuilder<int, FloorTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(floor_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_floor(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_floor_dispatch_vectors();
+        using impl::floor_contig_dispatch_vector;
+        using impl::floor_output_typeid_vector;
+        using impl::floor_strided_dispatch_vector;
+
+        auto floor_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, floor_output_typeid_vector,
+                floor_contig_dispatch_vector, floor_strided_dispatch_vector);
+        };
+        m.def("_floor", floor_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto floor_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              floor_output_typeid_vector);
+        };
+        m.def("_floor_result_type", floor_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor.hpp
new file mode 100644
index 000000000000..5e5fe41ce313
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_floor(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/imag.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/imag.cpp
new file mode 100644
index 000000000000..833295d22891
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/imag.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "imag.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/imag.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U16: ==== IMAG   (x)
+namespace impl
+{
+
+namespace imag_fn_ns = dpctl::tensor::kernels::imag;
+
+static unary_contig_impl_fn_ptr_t imag_contig_dispatch_vector[td_ns::num_types];
+static int imag_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    imag_strided_dispatch_vector[td_ns::num_types];
+
+void populate_imag_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = imag_fn_ns;
+
+    using fn_ns::ImagContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ImagContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(imag_contig_dispatch_vector);
+
+    using fn_ns::ImagStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ImagStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(imag_strided_dispatch_vector);
+
+    using fn_ns::ImagTypeMapFactory;
+    DispatchVectorBuilder<int, ImagTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(imag_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_imag(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_imag_dispatch_vectors();
+        using impl::imag_contig_dispatch_vector;
+        using impl::imag_output_typeid_vector;
+        using impl::imag_strided_dispatch_vector;
+
+        auto imag_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, imag_output_typeid_vector,
+                imag_contig_dispatch_vector, imag_strided_dispatch_vector);
+        };
+        m.def("_imag", imag_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto imag_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, imag_output_typeid_vector);
+        };
+        m.def("_imag_result_type", imag_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/imag.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/imag.hpp
new file mode 100644
index 000000000000..7cc285855328
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/imag.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_imag(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/isfinite.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/isfinite.cpp
new file mode 100644
index 000000000000..1882406b37f3
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/isfinite.cpp
@@ -0,0 +1,128 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "isfinite.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/isfinite.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U17: ==== ISFINITE   (x)
+namespace impl
+{
+
+namespace isfinite_fn_ns = dpctl::tensor::kernels::isfinite;
+
+static unary_contig_impl_fn_ptr_t
+    isfinite_contig_dispatch_vector[td_ns::num_types];
+static int isfinite_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    isfinite_strided_dispatch_vector[td_ns::num_types];
+
+void populate_isfinite_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = isfinite_fn_ns;
+
+    using fn_ns::IsFiniteContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsFiniteContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isfinite_contig_dispatch_vector);
+
+    using fn_ns::IsFiniteStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsFiniteStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isfinite_strided_dispatch_vector);
+
+    using fn_ns::IsFiniteTypeMapFactory;
+    DispatchVectorBuilder<int, IsFiniteTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(isfinite_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_isfinite(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_isfinite_dispatch_vectors();
+        using impl::isfinite_contig_dispatch_vector;
+        using impl::isfinite_output_typeid_vector;
+        using impl::isfinite_strided_dispatch_vector;
+
+        auto isfinite_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  isfinite_output_typeid_vector,
+                                  isfinite_contig_dispatch_vector,
+                                  isfinite_strided_dispatch_vector);
+        };
+        m.def("_isfinite", isfinite_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto isfinite_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              isfinite_output_typeid_vector);
+        };
+        m.def("_isfinite_result_type", isfinite_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/isfinite.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/isfinite.hpp
new file mode 100644
index 000000000000..31691916c1f8
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/isfinite.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_isfinite(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/isinf.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/isinf.cpp
new file mode 100644
index 000000000000..b6bb5605412c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/isinf.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "isinf.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/isinf.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U18: ==== ISINF   (x)
+namespace impl
+{
+
+namespace isinf_fn_ns = dpctl::tensor::kernels::isinf;
+
+static unary_contig_impl_fn_ptr_t
+    isinf_contig_dispatch_vector[td_ns::num_types];
+static int isinf_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    isinf_strided_dispatch_vector[td_ns::num_types];
+
+void populate_isinf_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = isinf_fn_ns;
+
+    using fn_ns::IsInfContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsInfContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isinf_contig_dispatch_vector);
+
+    using fn_ns::IsInfStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsInfStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isinf_strided_dispatch_vector);
+
+    using fn_ns::IsInfTypeMapFactory;
+    DispatchVectorBuilder<int, IsInfTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(isinf_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_isinf(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_isinf_dispatch_vectors();
+        using impl::isinf_contig_dispatch_vector;
+        using impl::isinf_output_typeid_vector;
+        using impl::isinf_strided_dispatch_vector;
+
+        auto isinf_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, isinf_output_typeid_vector,
+                isinf_contig_dispatch_vector, isinf_strided_dispatch_vector);
+        };
+        m.def("_isinf", isinf_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto isinf_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              isinf_output_typeid_vector);
+        };
+        m.def("_isinf_result_type", isinf_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/isinf.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/isinf.hpp
new file mode 100644
index 000000000000..3dec9f20c791
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/isinf.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_isinf(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/isnan.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/isnan.cpp
new file mode 100644
index 000000000000..ce832d0a0ed3
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/isnan.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "isnan.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/isnan.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U19: ==== ISNAN   (x)
+namespace impl
+{
+
+namespace isnan_fn_ns = dpctl::tensor::kernels::isnan;
+
+static unary_contig_impl_fn_ptr_t
+    isnan_contig_dispatch_vector[td_ns::num_types];
+static int isnan_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    isnan_strided_dispatch_vector[td_ns::num_types];
+
+void populate_isnan_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = isnan_fn_ns;
+
+    using fn_ns::IsNanContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsNanContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isnan_contig_dispatch_vector);
+
+    using fn_ns::IsNanStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsNanStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isnan_strided_dispatch_vector);
+
+    using fn_ns::IsNanTypeMapFactory;
+    DispatchVectorBuilder<int, IsNanTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(isnan_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_isnan(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_isnan_dispatch_vectors();
+        using impl::isnan_contig_dispatch_vector;
+        using impl::isnan_output_typeid_vector;
+        using impl::isnan_strided_dispatch_vector;
+
+        auto isnan_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, isnan_output_typeid_vector,
+                isnan_contig_dispatch_vector, isnan_strided_dispatch_vector);
+        };
+        m.def("_isnan", isnan_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto isnan_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              isnan_output_typeid_vector);
+        };
+        m.def("_isnan_result_type", isnan_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/isnan.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/isnan.hpp
new file mode 100644
index 000000000000..d5a8cdae37e8
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/isnan.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_isnan(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log.cpp
new file mode 100644
index 000000000000..2906304eaffa
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "log.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/log.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U20: ==== LOG   (x)
+namespace impl
+{
+
+namespace log_fn_ns = dpctl::tensor::kernels::log;
+
+static unary_contig_impl_fn_ptr_t log_contig_dispatch_vector[td_ns::num_types];
+static int log_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    log_strided_dispatch_vector[td_ns::num_types];
+
+void populate_log_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = log_fn_ns;
+
+    using fn_ns::LogContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(log_contig_dispatch_vector);
+
+    using fn_ns::LogStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, LogStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(log_strided_dispatch_vector);
+
+    using fn_ns::LogTypeMapFactory;
+    DispatchVectorBuilder<int, LogTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(log_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_log(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_log_dispatch_vectors();
+        using impl::log_contig_dispatch_vector;
+        using impl::log_output_typeid_vector;
+        using impl::log_strided_dispatch_vector;
+
+        auto log_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, log_output_typeid_vector,
+                log_contig_dispatch_vector, log_strided_dispatch_vector);
+        };
+        m.def("_log", log_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto log_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, log_output_typeid_vector);
+        };
+        m.def("_log_result_type", log_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log.hpp
new file mode 100644
index 000000000000..fb065e82e037
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_log(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log10.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log10.cpp
new file mode 100644
index 000000000000..9501af987341
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log10.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "log10.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/log10.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U23: ==== LOG10   (x)
+namespace impl
+{
+
+namespace log10_fn_ns = dpctl::tensor::kernels::log10;
+
+static unary_contig_impl_fn_ptr_t
+    log10_contig_dispatch_vector[td_ns::num_types];
+static int log10_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    log10_strided_dispatch_vector[td_ns::num_types];
+
+void populate_log10_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = log10_fn_ns;
+
+    using fn_ns::Log10ContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log10ContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(log10_contig_dispatch_vector);
+
+    using fn_ns::Log10StridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log10StridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(log10_strided_dispatch_vector);
+
+    using fn_ns::Log10TypeMapFactory;
+    DispatchVectorBuilder<int, Log10TypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(log10_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_log10(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_log10_dispatch_vectors();
+        using impl::log10_contig_dispatch_vector;
+        using impl::log10_output_typeid_vector;
+        using impl::log10_strided_dispatch_vector;
+
+        auto log10_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, log10_output_typeid_vector,
+                log10_contig_dispatch_vector, log10_strided_dispatch_vector);
+        };
+        m.def("_log10", log10_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto log10_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              log10_output_typeid_vector);
+        };
+        m.def("_log10_result_type", log10_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log10.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log10.hpp
new file mode 100644
index 000000000000..779b15472462
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log10.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_log10(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log1p.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log1p.cpp
new file mode 100644
index 000000000000..c94b3f3b5d7d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log1p.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "log1p.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/log1p.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U21: ==== LOG1P   (x)
+namespace impl
+{
+
+namespace log1p_fn_ns = dpctl::tensor::kernels::log1p;
+
+static unary_contig_impl_fn_ptr_t
+    log1p_contig_dispatch_vector[td_ns::num_types];
+static int log1p_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    log1p_strided_dispatch_vector[td_ns::num_types];
+
+void populate_log1p_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = log1p_fn_ns;
+
+    using fn_ns::Log1pContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log1pContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(log1p_contig_dispatch_vector);
+
+    using fn_ns::Log1pStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log1pStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(log1p_strided_dispatch_vector);
+
+    using fn_ns::Log1pTypeMapFactory;
+    DispatchVectorBuilder<int, Log1pTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(log1p_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_log1p(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_log1p_dispatch_vectors();
+        using impl::log1p_contig_dispatch_vector;
+        using impl::log1p_output_typeid_vector;
+        using impl::log1p_strided_dispatch_vector;
+
+        auto log1p_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, log1p_output_typeid_vector,
+                log1p_contig_dispatch_vector, log1p_strided_dispatch_vector);
+        };
+        m.def("_log1p", log1p_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto log1p_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              log1p_output_typeid_vector);
+        };
+        m.def("_log1p_result_type", log1p_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log1p.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log1p.hpp
new file mode 100644
index 000000000000..85bf21c8ea48
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log1p.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_log1p(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log2.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log2.cpp
new file mode 100644
index 000000000000..825d516f7820
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log2.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "log2.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/log2.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U22: ==== LOG2   (x)
+namespace impl
+{
+
+namespace log2_fn_ns = dpctl::tensor::kernels::log2;
+
+static unary_contig_impl_fn_ptr_t log2_contig_dispatch_vector[td_ns::num_types];
+static int log2_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    log2_strided_dispatch_vector[td_ns::num_types];
+
+void populate_log2_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = log2_fn_ns;
+
+    using fn_ns::Log2ContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log2ContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(log2_contig_dispatch_vector);
+
+    using fn_ns::Log2StridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log2StridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(log2_strided_dispatch_vector);
+
+    using fn_ns::Log2TypeMapFactory;
+    DispatchVectorBuilder<int, Log2TypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(log2_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_log2(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_log2_dispatch_vectors();
+        using impl::log2_contig_dispatch_vector;
+        using impl::log2_output_typeid_vector;
+        using impl::log2_strided_dispatch_vector;
+
+        auto log2_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, log2_output_typeid_vector,
+                log2_contig_dispatch_vector, log2_strided_dispatch_vector);
+        };
+        m.def("_log2", log2_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto log2_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, log2_output_typeid_vector);
+        };
+        m.def("_log2_result_type", log2_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log2.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log2.hpp
new file mode 100644
index 000000000000..11f757b1449d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/log2.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_log2(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_not.cpp
new file mode 100644
index 000000000000..e8f5845fac16
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_not.cpp
@@ -0,0 +1,129 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_not.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_not.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U24: ==== LOGICAL_NOT   (x)
+namespace impl
+{
+
+namespace logical_not_fn_ns = dpctl::tensor::kernels::logical_not;
+
+static unary_contig_impl_fn_ptr_t
+    logical_not_contig_dispatch_vector[td_ns::num_types];
+static int logical_not_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    logical_not_strided_dispatch_vector[td_ns::num_types];
+
+void populate_logical_not_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_not_fn_ns;
+
+    using fn_ns::LogicalNotContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogicalNotContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(logical_not_contig_dispatch_vector);
+
+    using fn_ns::LogicalNotStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, LogicalNotStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(logical_not_strided_dispatch_vector);
+
+    using fn_ns::LogicalNotTypeMapFactory;
+    DispatchVectorBuilder<int, LogicalNotTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(logical_not_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_logical_not(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_not_dispatch_vectors();
+        using impl::logical_not_contig_dispatch_vector;
+        using impl::logical_not_output_typeid_vector;
+        using impl::logical_not_strided_dispatch_vector;
+
+        auto logical_not_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  logical_not_output_typeid_vector,
+                                  logical_not_contig_dispatch_vector,
+                                  logical_not_strided_dispatch_vector);
+        };
+        m.def("_logical_not", logical_not_pyapi, "", py::arg("src"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        auto logical_not_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              logical_not_output_typeid_vector);
+        };
+        m.def("_logical_not_result_type", logical_not_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_not.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_not.hpp
new file mode 100644
index 000000000000..f3bb79cc28cc
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_not.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_not(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/negative.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/negative.cpp
new file mode 100644
index 000000000000..8510a15eab00
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/negative.cpp
@@ -0,0 +1,128 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "negative.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/negative.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U25: ==== NEGATIVE   (x)
+namespace impl
+{
+
+namespace negative_fn_ns = dpctl::tensor::kernels::negative;
+
+static unary_contig_impl_fn_ptr_t
+    negative_contig_dispatch_vector[td_ns::num_types];
+static int negative_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    negative_strided_dispatch_vector[td_ns::num_types];
+
+void populate_negative_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = negative_fn_ns;
+
+    using fn_ns::NegativeContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, NegativeContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(negative_contig_dispatch_vector);
+
+    using fn_ns::NegativeStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, NegativeStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(negative_strided_dispatch_vector);
+
+    using fn_ns::NegativeTypeMapFactory;
+    DispatchVectorBuilder<int, NegativeTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(negative_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_negative(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_negative_dispatch_vectors();
+        using impl::negative_contig_dispatch_vector;
+        using impl::negative_output_typeid_vector;
+        using impl::negative_strided_dispatch_vector;
+
+        auto negative_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  negative_output_typeid_vector,
+                                  negative_contig_dispatch_vector,
+                                  negative_strided_dispatch_vector);
+        };
+        m.def("_negative", negative_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto negative_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              negative_output_typeid_vector);
+        };
+        m.def("_negative_result_type", negative_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/negative.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/negative.hpp
new file mode 100644
index 000000000000..083df516b435
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/negative.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_negative(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/positive.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/positive.cpp
new file mode 100644
index 000000000000..6518b10a77c0
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/positive.cpp
@@ -0,0 +1,128 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "positive.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/positive.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U26: ==== POSITIVE   (x)
+namespace impl
+{
+
+namespace positive_fn_ns = dpctl::tensor::kernels::positive;
+
+static unary_contig_impl_fn_ptr_t
+    positive_contig_dispatch_vector[td_ns::num_types];
+static int positive_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    positive_strided_dispatch_vector[td_ns::num_types];
+
+void populate_positive_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = positive_fn_ns;
+
+    using fn_ns::PositiveContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, PositiveContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(positive_contig_dispatch_vector);
+
+    using fn_ns::PositiveStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, PositiveStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(positive_strided_dispatch_vector);
+
+    using fn_ns::PositiveTypeMapFactory;
+    DispatchVectorBuilder<int, PositiveTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(positive_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_positive(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_positive_dispatch_vectors();
+        using impl::positive_contig_dispatch_vector;
+        using impl::positive_output_typeid_vector;
+        using impl::positive_strided_dispatch_vector;
+
+        auto positive_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  positive_output_typeid_vector,
+                                  positive_contig_dispatch_vector,
+                                  positive_strided_dispatch_vector);
+        };
+        m.def("_positive", positive_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto positive_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              positive_output_typeid_vector);
+        };
+        m.def("_positive_result_type", positive_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/positive.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/positive.hpp
new file mode 100644
index 000000000000..05bd04b577af
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/positive.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_positive(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py
index a81416a28e43..90a669f713bd 100644
--- a/dpnp/dpnp_iface_logic.py
+++ b/dpnp/dpnp_iface_logic.py
@@ -51,10 +51,11 @@
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt
+import dpctl_ext.tensor._tensor_elementwise_impl as ti_ext
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
-from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc
 
+from .dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc
 from .dpnp_array import dpnp_array
 from .dpnp_utils import get_usm_allocations
 
@@ -1094,8 +1095,8 @@ def iscomplexobj(x):
 
 isfinite = DPNPUnaryFunc(
     "isfinite",
-    ti._isfinite_result_type,
-    ti._isfinite,
+    ti_ext._isfinite_result_type,
+    ti_ext._isfinite,
     _ISFINITE_DOCSTRING,
 )
 
@@ -1337,8 +1338,8 @@ def isin(
 
 isinf = DPNPUnaryFunc(
     "isinf",
-    ti._isinf_result_type,
-    ti._isinf,
+    ti_ext._isinf_result_type,
+    ti_ext._isinf,
     _ISINF_DOCSTRING,
 )
 
@@ -1395,8 +1396,8 @@ def isin(
 
 isnan = DPNPUnaryFunc(
     "isnan",
-    ti._isnan_result_type,
-    ti._isnan,
+    ti_ext._isnan_result_type,
+    ti_ext._isnan,
     _ISNAN_DOCSTRING,
 )
 
@@ -1968,8 +1969,8 @@ def isscalar(element):
 
 logical_not = DPNPUnaryFunc(
     "logical_not",
-    ti._logical_not_result_type,
-    ti._logical_not,
+    ti_ext._logical_not_result_type,
+    ti_ext._logical_not,
     _LOGICAL_NOT_DOCSTRING,
 )
 
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index 906f814604b0..a6e0d0b39a98 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -2057,8 +2057,8 @@ def ediff1d(ary, to_end=None, to_begin=None):
 
 floor = DPNPUnaryFunc(
     "floor",
-    ti._floor_result_type,
-    ti._floor,
+    ti_ext._floor_result_type,
+    ti_ext._floor,
     _FLOOR_DOCSTRING,
     mkl_fn_to_call="_mkl_floor_to_call",
     mkl_impl_fn="_floor",
@@ -2941,8 +2941,8 @@ def gradient(f, *varargs, axis=None, edge_order=1):
 
 imag = DPNPImag(
     "imag",
-    ti._imag_result_type,
-    ti._imag,
+    ti_ext._imag_result_type,
+    ti_ext._imag,
     _IMAG_DOCSTRING,
 )
 
@@ -3852,8 +3852,8 @@ def _check_nan_inf(val, val_dt):
 
 negative = DPNPUnaryFunc(
     "negative",
-    ti._negative_result_type,
-    ti._negative,
+    ti_ext._negative_result_type,
+    ti_ext._negative,
     _NEGATIVE_DOCSTRING,
     acceptance_fn=acceptance_fn_negative,
 )
@@ -3988,8 +3988,8 @@ def _check_nan_inf(val, val_dt):
 
 positive = DPNPUnaryFunc(
     "positive",
-    ti._positive_result_type,
-    ti._positive,
+    ti_ext._positive_result_type,
+    ti_ext._positive,
     _POSITIVE_DOCSTRING,
     acceptance_fn=acceptance_fn_positive,
 )
diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py
index 24004fbbeaf9..d459a3392311 100644
--- a/dpnp/dpnp_iface_trigonometric.py
+++ b/dpnp/dpnp_iface_trigonometric.py
@@ -777,8 +777,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 cos = DPNPUnaryFunc(
     "cos",
-    ti._cos_result_type,
-    ti._cos,
+    ti_ext._cos_result_type,
+    ti_ext._cos,
     _COS_DOCSTRING,
     mkl_fn_to_call="_mkl_cos_to_call",
     mkl_impl_fn="_cos",
@@ -841,8 +841,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 cosh = DPNPUnaryFunc(
     "cosh",
-    ti._cosh_result_type,
-    ti._cosh,
+    ti_ext._cosh_result_type,
+    ti_ext._cosh,
     _COSH_DOCSTRING,
     mkl_fn_to_call="_mkl_cosh_to_call",
     mkl_impl_fn="_cosh",
@@ -1127,8 +1127,8 @@ def cumlogsumexp(
 
 exp = DPNPUnaryFunc(
     "exp",
-    ti._exp_result_type,
-    ti._exp,
+    ti_ext._exp_result_type,
+    ti_ext._exp,
     _EXP_DOCSTRING,
     mkl_fn_to_call="_mkl_exp_to_call",
     mkl_impl_fn="_exp",
@@ -1259,8 +1259,8 @@ def cumlogsumexp(
 
 expm1 = DPNPUnaryFunc(
     "expm1",
-    ti._expm1_result_type,
-    ti._expm1,
+    ti_ext._expm1_result_type,
+    ti_ext._expm1,
     _EXPM1_DOCSTRING,
     mkl_fn_to_call="_mkl_expm1_to_call",
     mkl_impl_fn="_expm1",
@@ -1416,8 +1416,8 @@ def cumlogsumexp(
 
 log = DPNPUnaryFunc(
     "log",
-    ti._log_result_type,
-    ti._log,
+    ti_ext._log_result_type,
+    ti_ext._log,
     _LOG_DOCSTRING,
     mkl_fn_to_call="_mkl_ln_to_call",
     mkl_impl_fn="_ln",
@@ -1495,8 +1495,8 @@ def cumlogsumexp(
 
 log10 = DPNPUnaryFunc(
     "log10",
-    ti._log10_result_type,
-    ti._log10,
+    ti_ext._log10_result_type,
+    ti_ext._log10,
     _LOG10_DOCSTRING,
     mkl_fn_to_call="_mkl_log10_to_call",
     mkl_impl_fn="_log10",
@@ -1580,8 +1580,8 @@ def cumlogsumexp(
 
 log1p = DPNPUnaryFunc(
     "log1p",
-    ti._log1p_result_type,
-    ti._log1p,
+    ti_ext._log1p_result_type,
+    ti_ext._log1p,
     _LOG1P_DOCSTRING,
     mkl_fn_to_call="_mkl_log1p_to_call",
     mkl_impl_fn="_log1p",
@@ -1660,8 +1660,8 @@ def cumlogsumexp(
 
 log2 = DPNPUnaryFunc(
     "log2",
-    ti._log2_result_type,
-    ti._log2,
+    ti_ext._log2_result_type,
+    ti_ext._log2,
     _LOG2_DOCSTRING,
     mkl_fn_to_call="_mkl_log2_to_call",
     mkl_impl_fn="_log2",

From 3a0c2ff8eed0ffe774eac9af9c7533714fa4dda8 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 18 Mar 2026 12:26:07 +0100
Subject: [PATCH 15/43] Extend `_tensor_elementwise_impl` (unary) part 3
 (#2801)

This PR extends `_tensor_elementwise_impl` with the remaining unary
functions:  `real, reciprocal, round, rsqrt, sign, signbit, sin, sinh, sqrt, square, tan, tanh, trunc`
---
 .github/workflows/conda-package.yml           |   2 +-
 dpctl_ext/tensor/CMakeLists.txt               |  32 +-
 dpctl_ext/tensor/__init__.py                  |  32 ++
 dpctl_ext/tensor/_elementwise_funcs.py        | 468 ++++++++++++++++++
 .../kernels/elementwise_functions/cbrt.hpp    | 209 ++++++++
 .../kernels/elementwise_functions/exp2.hpp    | 272 ++++++++++
 .../kernels/elementwise_functions/proj.hpp    | 239 +++++++++
 .../kernels/elementwise_functions/real.hpp    | 231 +++++++++
 .../elementwise_functions/reciprocal.hpp      | 229 +++++++++
 .../kernels/elementwise_functions/round.hpp   | 241 +++++++++
 .../kernels/elementwise_functions/rsqrt.hpp   | 209 ++++++++
 .../kernels/elementwise_functions/sign.hpp    | 258 ++++++++++
 .../kernels/elementwise_functions/signbit.hpp | 223 +++++++++
 .../kernels/elementwise_functions/sin.hpp     | 333 +++++++++++++
 .../kernels/elementwise_functions/sinh.hpp    | 302 +++++++++++
 .../kernels/elementwise_functions/sqrt.hpp    | 224 +++++++++
 .../kernels/elementwise_functions/square.hpp  | 251 ++++++++++
 .../kernels/elementwise_functions/tan.hpp     | 276 +++++++++++
 .../kernels/elementwise_functions/tanh.hpp    | 270 ++++++++++
 .../kernels/elementwise_functions/trunc.hpp   | 226 +++++++++
 .../source/elementwise_functions/cbrt.cpp     | 125 +++++
 .../source/elementwise_functions/cbrt.hpp     |  46 ++
 .../elementwise_common.cpp                    |  64 +--
 .../source/elementwise_functions/exp2.cpp     | 125 +++++
 .../source/elementwise_functions/exp2.hpp     |  46 ++
 .../source/elementwise_functions/proj.cpp     | 125 +++++
 .../source/elementwise_functions/proj.hpp     |  46 ++
 .../source/elementwise_functions/real.cpp     | 125 +++++
 .../source/elementwise_functions/real.hpp     |  46 ++
 .../elementwise_functions/reciprocal.cpp      | 129 +++++
 .../elementwise_functions/reciprocal.hpp      |  46 ++
 .../source/elementwise_functions/round.cpp    | 126 +++++
 .../source/elementwise_functions/round.hpp    |  46 ++
 .../source/elementwise_functions/rsqrt.cpp    | 127 +++++
 .../source/elementwise_functions/rsqrt.hpp    |  46 ++
 .../source/elementwise_functions/sign.cpp     | 125 +++++
 .../source/elementwise_functions/sign.hpp     |  46 ++
 .../source/elementwise_functions/signbit.cpp  | 128 +++++
 .../source/elementwise_functions/signbit.hpp  |  46 ++
 .../source/elementwise_functions/sin.cpp      | 125 +++++
 .../source/elementwise_functions/sin.hpp      |  46 ++
 .../source/elementwise_functions/sinh.cpp     | 125 +++++
 .../source/elementwise_functions/sinh.hpp     |  46 ++
 .../source/elementwise_functions/sqrt.cpp     | 125 +++++
 .../source/elementwise_functions/sqrt.hpp     |  46 ++
 .../source/elementwise_functions/square.cpp   | 127 +++++
 .../source/elementwise_functions/square.hpp   |  46 ++
 .../source/elementwise_functions/tan.cpp      | 125 +++++
 .../source/elementwise_functions/tan.hpp      |  46 ++
 .../source/elementwise_functions/tanh.cpp     | 125 +++++
 .../source/elementwise_functions/tanh.hpp     |  46 ++
 .../source/elementwise_functions/trunc.cpp    | 127 +++++
 .../source/elementwise_functions/trunc.hpp    |  46 ++
 dpnp/dpnp_iface_mathematical.py               |  24 +-
 dpnp/dpnp_iface_trigonometric.py              |  40 +-
 55 files changed, 7324 insertions(+), 81 deletions(-)
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/cbrt.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/cbrt.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/exp2.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/exp2.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/proj.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/proj.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/real.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/real.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/reciprocal.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/reciprocal.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/round.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/round.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/rsqrt.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/rsqrt.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/sign.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/sign.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/signbit.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/signbit.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/sin.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/sin.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/sinh.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/sinh.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/sqrt.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/sqrt.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/square.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/square.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/tan.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/tan.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/tanh.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/tanh.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/trunc.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/trunc.hpp

diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index d2ac90621aaa..eb66c91dc8c2 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -37,7 +37,7 @@ jobs:
       actions: write
 
     runs-on: ${{ matrix.os }}
-    timeout-minutes: 60
+    timeout-minutes: 80
 
     defaults:
       run:
diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index 261204223ddd..ef3565f9827e 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -88,7 +88,7 @@ set(_elementwise_sources
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_or.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_xor.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp
@@ -96,7 +96,7 @@ set(_elementwise_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp2.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp2.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/expm1.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor_divide.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor.cpp
@@ -126,23 +126,23 @@ set(_elementwise_sources
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/reciprocal.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/reciprocal.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/round.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/rsqrt.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sign.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/signbit.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sin.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sinh.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/square.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/round.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/rsqrt.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sign.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/signbit.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sin.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sinh.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/square.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/trunc.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/trunc.cpp
 )
 set(_reduction_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index ea18c2aab35e..70352687c5d6 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -63,11 +63,13 @@
     atan,
     atanh,
     bitwise_invert,
+    cbrt,
     ceil,
     conj,
     cos,
     cosh,
     exp,
+    exp2,
     expm1,
     floor,
     imag,
@@ -81,6 +83,20 @@
     logical_not,
     negative,
     positive,
+    proj,
+    real,
+    reciprocal,
+    round,
+    rsqrt,
+    sign,
+    signbit,
+    sin,
+    sinh,
+    sqrt,
+    square,
+    tan,
+    tanh,
+    trunc,
 )
 from ._indexing_functions import (
     extract,
@@ -154,6 +170,7 @@
     "broadcast_arrays",
     "broadcast_to",
     "can_cast",
+    "cbrt",
     "ceil",
     "concat",
     "conj",
@@ -172,6 +189,7 @@
     "expand_dims",
     "eye",
     "exp",
+    "exp2",
     "expm1",
     "finfo",
     "flip",
@@ -205,26 +223,40 @@
     "place",
     "positive",
     "prod",
+    "proj",
     "put",
     "put_along_axis",
+    "real",
+    "reciprocal",
     "reduce_hypot",
     "repeat",
     "reshape",
     "result_type",
     "roll",
+    "round",
+    "rsqrt",
     "searchsorted",
+    "sign",
+    "signbit",
+    "sin",
+    "sinh",
     "sort",
+    "sqrt",
+    "square",
     "squeeze",
     "stack",
     "sum",
     "swapaxes",
     "take",
     "take_along_axis",
+    "tan",
+    "tanh",
     "tile",
     "top_k",
     "to_numpy",
     "tril",
     "triu",
+    "trunc",
     "unique_all",
     "unique_counts",
     "unique_inverse",
diff --git a/dpctl_ext/tensor/_elementwise_funcs.py b/dpctl_ext/tensor/_elementwise_funcs.py
index b57074ae9784..ae0ef8aa3496 100644
--- a/dpctl_ext/tensor/_elementwise_funcs.py
+++ b/dpctl_ext/tensor/_elementwise_funcs.py
@@ -33,6 +33,7 @@
 from ._elementwise_common import UnaryElementwiseFunc
 from ._type_utils import (
     _acceptance_fn_negative,
+    _acceptance_fn_reciprocal,
 )
 
 # U01: ==== ABS    (x)
@@ -782,6 +783,473 @@
 )
 del _positive_docstring_
 
+# U27: ==== REAL        (x)
+_real_docstring = r"""
+real(x, /, \*, out=None, order='K')
+
+Computes real part of each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise real component of input.
+        If the input is a real-valued data type, the returned array has
+        the same data type. If the input is a complex floating-point
+        data type, the returned array has a floating-point data type
+        with the same floating-point precision as complex input.
+"""
+
+real = UnaryElementwiseFunc(
+    "real", ti._real_result_type, ti._real, _real_docstring
+)
+del _real_docstring
+
+# U28: ==== ROUND       (x)
+_round_docstring = r"""
+round(x, /, \*, out=None, order='K')
+
+Rounds each element `x_i` of the input array `x` to
+the nearest integer-valued number.
+
+When two integers are equally close to `x_i`, the result is the nearest even
+integer to `x_i`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a numeric data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise rounded values.
+"""
+
+round = UnaryElementwiseFunc(
+    "round", ti._round_result_type, ti._round, _round_docstring
+)
+del _round_docstring
+
+# U29: ==== SIGN        (x)
+_sign_docstring = r"""
+sign(x, /, \*, out=None, order='K')
+
+Computes an indication of the sign of each element `x_i` of input array `x`
+using the signum function.
+
+The signum function returns `-1` if `x_i` is less than `0`,
+`0` if `x_i` is equal to `0`, and `1` if `x_i` is greater than `0`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a numeric data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise result of the signum function. The
+        data type of the returned array is determined by the Type Promotion
+        Rules.
+"""
+
+sign = UnaryElementwiseFunc(
+    "sign", ti._sign_result_type, ti._sign, _sign_docstring
+)
+del _sign_docstring
+
+# U30: ==== SIN         (x)
+_sin_docstring = r"""
+sin(x, /, \*, out=None, order='K')
+
+Computes sine for each element `x_i` of input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a real-valued floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise sine. The data type of the
+        returned array is determined by the Type Promotion Rules.
+"""
+
+sin = UnaryElementwiseFunc("sin", ti._sin_result_type, ti._sin, _sin_docstring)
+del _sin_docstring
+
+# U31: ==== SINH        (x)
+_sinh_docstring = r"""
+sinh(x, /, \*, out=None, order='K')
+
+Computes hyperbolic sine for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise hyperbolic sine. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+sinh = UnaryElementwiseFunc(
+    "sinh", ti._sinh_result_type, ti._sinh, _sinh_docstring
+)
+del _sinh_docstring
+
+# U32: ==== SQUARE      (x)
+_square_docstring_ = r"""
+square(x, /, \*, out=None, order='K')
+
+Squares each element `x_i` of input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise squares of `x`. The data type of
+        the returned array is determined by the Type Promotion Rules.
+"""
+
+square = UnaryElementwiseFunc(
+    "square", ti._square_result_type, ti._square, _square_docstring_
+)
+del _square_docstring_
+
+# U33: ==== SQRT        (x)
+_sqrt_docstring_ = r"""
+sqrt(x, /, \*, out=None, order='K')
+
+Computes the positive square-root for each element `x_i` of input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise positive square-roots of `x`. The
+        data type of the returned array is determined by the Type Promotion
+        Rules.
+"""
+
+sqrt = UnaryElementwiseFunc(
+    "sqrt", ti._sqrt_result_type, ti._sqrt, _sqrt_docstring_
+)
+del _sqrt_docstring_
+
+# U34: ==== TAN         (x)
+_tan_docstring = r"""
+tan(x, /, \*, out=None, order='K')
+
+Computes tangent for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise tangent. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+tan = UnaryElementwiseFunc("tan", ti._tan_result_type, ti._tan, _tan_docstring)
+del _tan_docstring
+
+# U35: ==== TANH        (x)
+_tanh_docstring = r"""
+tanh(x, /, \*, out=None, order='K')
+
+Computes hyperbolic tangent for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise hyperbolic tangent. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+tanh = UnaryElementwiseFunc(
+    "tanh", ti._tanh_result_type, ti._tanh, _tanh_docstring
+)
+del _tanh_docstring
+
+# U36: ==== TRUNC       (x)
+_trunc_docstring = r"""
+trunc(x, /, \*, out=None, order='K')
+
+Returns the truncated value for each element `x_i` for input array `x`.
+
+The truncated value of the scalar `x` is the nearest integer i which is
+closer to zero than `x` is. In short, the fractional part of the
+signed number `x` is discarded.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a boolean or real-valued data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise division. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+trunc = UnaryElementwiseFunc(
+    "trunc", ti._trunc_result_type, ti._trunc, _trunc_docstring
+)
+del _trunc_docstring
+
+# U37: ==== CBRT        (x)
+_cbrt_docstring_ = r"""
+cbrt(x, /, \*, out=None, order='K')
+
+Computes the cube-root for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a real-valued floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise cube-root.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+cbrt = UnaryElementwiseFunc(
+    "cbrt", ti._cbrt_result_type, ti._cbrt, _cbrt_docstring_
+)
+del _cbrt_docstring_
+
+# U38: ==== EXP2        (x)
+_exp2_docstring_ = r"""
+exp2(x, /, \*, out=None, order='K')
+
+Computes the base-2 exponential for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise base-2 exponentials.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+exp2 = UnaryElementwiseFunc(
+    "exp2", ti._exp2_result_type, ti._exp2, _exp2_docstring_
+)
+del _exp2_docstring_
+
+# U39: ==== RSQRT        (x)
+_rsqrt_docstring_ = r"""
+rsqrt(x, /, \*, out=None, order='K')
+
+Computes the reciprocal square-root for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a real-valued floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise reciprocal square-root.
+        The returned array has a floating-point data type determined by
+        the Type Promotion Rules.
+"""
+
+rsqrt = UnaryElementwiseFunc(
+    "rsqrt", ti._rsqrt_result_type, ti._rsqrt, _rsqrt_docstring_
+)
+del _rsqrt_docstring_
+
+# U40: ==== PROJ        (x)
+_proj_docstring = r"""
+proj(x, /, \*, out=None, order='K')
+
+Computes projection of each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a complex data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise projection.
+"""
+
+proj = UnaryElementwiseFunc(
+    "proj", ti._proj_result_type, ti._proj, _proj_docstring
+)
+del _proj_docstring
+
+# U41: ==== SIGNBIT        (x)
+_signbit_docstring = r"""
+signbit(x, /, \*, out=None, order='K')
+
+Computes an indication of whether the sign bit of each element `x_i` of
+input array `x` is set.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a real-valued floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise signbit results. The returned array
+        must have a data type of `bool`.
+"""
+
+signbit = UnaryElementwiseFunc(
+    "signbit", ti._signbit_result_type, ti._signbit, _signbit_docstring
+)
+del _signbit_docstring
+
+# U42: ==== RECIPROCAL        (x)
+_reciprocal_docstring = r"""
+reciprocal(x, /, \*, out=None, order='K')
+
+Computes the reciprocal of each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise reciprocals.
+        The returned array has a floating-point data type determined
+        by the Type Promotion Rules.
+"""
+
+reciprocal = UnaryElementwiseFunc(
+    "reciprocal",
+    ti._reciprocal_result_type,
+    ti._reciprocal,
+    _reciprocal_docstring,
+    acceptance_fn=_acceptance_fn_reciprocal,
+)
+del _reciprocal_docstring
+
 # U43: ==== ANGLE        (x)
 _angle_docstring = r"""
 angle(x, /, \*, out=None, order='K')
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
new file mode 100644
index 000000000000..57bbb09523a4
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
@@ -0,0 +1,209 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of CBRT(x)
+/// function that computes a cube root.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::kernels::cbrt
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT, typename resT>
+struct CbrtFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::true_type;
+
+    resT operator()(const argT &in) const
+    {
+        return sycl::cbrt(in);
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using CbrtContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           CbrtFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using CbrtStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, CbrtFunctor<argTy, resTy>>;
+
+template <typename T>
+struct CbrtOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct CbrtContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class cbrt_contig_kernel;
+
+template <typename argTy>
+sycl::event cbrt_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using CbrtHS = hyperparam_detail::CbrtContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = CbrtHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = CbrtHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct CbrtContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!CbrtOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = cbrt_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct CbrtTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::cbrt(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CbrtOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class cbrt_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    cbrt_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, CbrtOutputType, CbrtStridedFunctor, cbrt_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct CbrtStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!CbrtOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = cbrt_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::cbrt
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
new file mode 100644
index 000000000000..dd09f4eee342
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
@@ -0,0 +1,272 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of EXP2(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::exp2
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct Exp2Functor
+{
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            const argT tmp = in * sycl::log(realT(2));
+
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+
+            const realT x = std::real(tmp);
+            const realT y = std::imag(tmp);
+            if (std::isfinite(x)) {
+                if (std::isfinite(y)) {
+                    return exprm_ns::exp(exprm_ns::complex<realT>(tmp));
+                }
+                else {
+                    return resT{q_nan, q_nan};
+                }
+            }
+            else if (std::isnan(x)) {
+                /* x is nan */
+                if (y == realT(0)) {
+                    return resT{in};
+                }
+                else {
+                    return resT{x, q_nan};
+                }
+            }
+            else {
+                if (!sycl::signbit(x)) { /* x is +inf */
+                    if (y == realT(0)) {
+                        return resT{x, y};
+                    }
+                    else if (std::isfinite(y)) {
+                        return resT{x * sycl::cos(y), x * sycl::sin(y)};
+                    }
+                    else {
+                        /* x = +inf, y = +-inf || nan */
+                        return resT{x, q_nan};
+                    }
+                }
+                else { /* x is -inf */
+                    if (std::isfinite(y)) {
+                        realT exp_x = sycl::exp(x);
+                        return resT{exp_x * sycl::cos(y), exp_x * sycl::sin(y)};
+                    }
+                    else {
+                        /* x = -inf, y = +-inf || nan */
+                        return resT{0, 0};
+                    }
+                }
+            }
+        }
+        else {
+            return sycl::exp2(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using Exp2ContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           Exp2Functor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using Exp2StridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, Exp2Functor<argTy, resTy>>;
+
+template <typename T>
+struct Exp2OutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct Exp2ContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class exp2_contig_kernel;
+
+template <typename argTy>
+sycl::event exp2_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using Exp2HS = hyperparam_detail::Exp2ContigHyperparameterSet<argTy>;
+
+    static constexpr std::uint8_t vec_sz = Exp2HS::vec_sz;
+    static constexpr std::uint8_t n_vecs = Exp2HS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct Exp2ContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!Exp2OutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = exp2_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct Exp2TypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::exp2(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename Exp2OutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class exp2_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    exp2_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, Exp2OutputType, Exp2StridedFunctor, exp2_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct Exp2StridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!Exp2OutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = exp2_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::exp2
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
new file mode 100644
index 000000000000..039da657cfd2
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
@@ -0,0 +1,239 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of PROJ(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::proj
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct ProjFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::false_type;
+
+    resT operator()(const argT &in) const
+    {
+        using realT = typename argT::value_type;
+        const realT x = std::real(in);
+        const realT y = std::imag(in);
+
+        if (std::isinf(x)) {
+            return value_at_infinity(y);
+        }
+        else if (std::isinf(y)) {
+            return value_at_infinity(y);
+        }
+        else {
+            return in;
+        }
+    }
+
+private:
+    template <typename T>
+    std::complex<T> value_at_infinity(const T &y) const
+    {
+        const T res_im = sycl::copysign(T(0), y);
+        return std::complex<T>{std::numeric_limits<T>::infinity(), res_im};
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using ProjContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           ProjFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using ProjStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, ProjFunctor<argTy, resTy>>;
+
+template <typename T>
+struct ProjOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct ProjContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class proj_contig_kernel;
+
+template <typename argTy>
+sycl::event proj_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using ProjHS = hyperparam_detail::ProjContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = ProjHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = ProjHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, ProjOutputType, ProjContigFunctor, proj_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct ProjContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!ProjOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            if constexpr (std::is_same_v<T, std::complex<double>>) {
+                fnT fn = proj_contig_impl<T>;
+                return fn;
+            }
+            else {
+                fnT fn = proj_contig_impl<T>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct ProjTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::proj(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename ProjOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class proj_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    proj_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, ProjOutputType, ProjStridedFunctor, proj_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct ProjStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!ProjOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = proj_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::proj
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
new file mode 100644
index 000000000000..d21a9e6baa7d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
@@ -0,0 +1,231 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of REAL(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::real
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+using dpctl::tensor::type_utils::is_complex_v;
+
+template <typename argT, typename resT>
+struct RealFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex_v<argT>) {
+            return std::real(in);
+        }
+        else {
+            static_assert(std::is_same_v<resT, argT>);
+            return in;
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using RealContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           RealFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using RealStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, RealFunctor<argTy, resTy>>;
+
+template <typename T>
+struct RealOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, bool>,
+        td_ns::TypeMapResultEntry<T, std::uint8_t>,
+        td_ns::TypeMapResultEntry<T, std::uint16_t>,
+        td_ns::TypeMapResultEntry<T, std::uint32_t>,
+        td_ns::TypeMapResultEntry<T, std::uint64_t>,
+        td_ns::TypeMapResultEntry<T, std::int8_t>,
+        td_ns::TypeMapResultEntry<T, std::int16_t>,
+        td_ns::TypeMapResultEntry<T, std::int32_t>,
+        td_ns::TypeMapResultEntry<T, std::int64_t>,
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>, float>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct RealContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class real_contig_kernel;
+
+template <typename argTy>
+sycl::event real_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using RealHS = hyperparam_detail::RealContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = RealHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = RealHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, RealOutputType, RealContigFunctor, real_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct RealContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!RealOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = real_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct RealTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::real(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename RealOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class real_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    real_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, RealOutputType, RealStridedFunctor, real_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct RealStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!RealOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = real_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::real
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
new file mode 100644
index 000000000000..f26f4043c9ab
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
@@ -0,0 +1,229 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of RECIPROCAL(x)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::reciprocal
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct ReciprocalFunctor
+{
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+
+            using realT = typename argT::value_type;
+
+            return realT(1) / exprm_ns::complex<realT>(in);
+        }
+        else {
+            return argT(1) / in;
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using ReciprocalContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           ReciprocalFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using ReciprocalStridedFunctor =
+    elementwise_common::UnaryStridedFunctor<argTy,
+                                            resTy,
+                                            IndexerT,
+                                            ReciprocalFunctor<argTy, resTy>>;
+
+template <typename T>
+struct ReciprocalOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct ReciprocalContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class reciprocal_contig_kernel;
+
+template <typename argTy>
+sycl::event reciprocal_contig_impl(sycl::queue &exec_q,
+                                   std::size_t nelems,
+                                   const char *arg_p,
+                                   char *res_p,
+                                   const std::vector<sycl::event> &depends = {})
+{
+    using RecipHS = hyperparam_detail::ReciprocalContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = RecipHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = RecipHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, ReciprocalOutputType, ReciprocalContigFunctor,
+        reciprocal_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                  depends);
+}
+
+template <typename fnT, typename T>
+struct ReciprocalContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!ReciprocalOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = reciprocal_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct ReciprocalTypeMapFactory
+{
+    /*! @brief get typeid for output type of 1 / x */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename ReciprocalOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class reciprocal_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    reciprocal_strided_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            int nd,
+                            const ssize_t *shape_and_strides,
+                            const char *arg_p,
+                            ssize_t arg_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends,
+                            const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<argTy, ReciprocalOutputType,
+                                                  ReciprocalStridedFunctor,
+                                                  reciprocal_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct ReciprocalStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!ReciprocalOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = reciprocal_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::reciprocal
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
new file mode 100644
index 000000000000..b20166a4d505
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
@@ -0,0 +1,241 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ROUND(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::round
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct RoundFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+
+        if constexpr (std::is_integral_v<argT>) {
+            return in;
+        }
+        else if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+            return resT{round_func<realT>(std::real(in)),
+                        round_func<realT>(std::imag(in))};
+        }
+        else {
+            return round_func<argT>(in);
+        }
+    }
+
+private:
+    template <typename T>
+    T round_func(const T &input) const
+    {
+        return sycl::rint(input);
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using RoundContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           RoundFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using RoundStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, RoundFunctor<argTy, resTy>>;
+
+template <typename T>
+struct RoundOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, std::uint8_t>,
+        td_ns::TypeMapResultEntry<T, std::uint16_t>,
+        td_ns::TypeMapResultEntry<T, std::uint32_t>,
+        td_ns::TypeMapResultEntry<T, std::uint64_t>,
+        td_ns::TypeMapResultEntry<T, std::int8_t>,
+        td_ns::TypeMapResultEntry<T, std::int16_t>,
+        td_ns::TypeMapResultEntry<T, std::int32_t>,
+        td_ns::TypeMapResultEntry<T, std::int64_t>,
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct RoundContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class round_contig_kernel;
+
+template <typename argTy>
+sycl::event round_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using RoundHS = hyperparam_detail::RoundContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = RoundHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = RoundHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, RoundOutputType, RoundContigFunctor, round_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct RoundContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!RoundOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = round_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct RoundTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::round(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename RoundOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class round_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    round_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, RoundOutputType, RoundStridedFunctor, round_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct RoundStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!RoundOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = round_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::round
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
new file mode 100644
index 000000000000..0228aecdca67
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -0,0 +1,209 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of RSQRT(x)
+/// function that computes the reciprocal square root.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::kernels::rsqrt
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT, typename resT>
+struct RsqrtFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::true_type;
+
+    resT operator()(const argT &in) const
+    {
+        return sycl::rsqrt(in);
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using RsqrtContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           RsqrtFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using RsqrtStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, RsqrtFunctor<argTy, resTy>>;
+
+template <typename T>
+struct RsqrtOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct RsqrtContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class rsqrt_contig_kernel;
+
+template <typename argTy>
+sycl::event rsqrt_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using RsqrtHS = hyperparam_detail::RsqrtContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = RsqrtHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = RsqrtHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct RsqrtContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!RsqrtOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = rsqrt_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct RsqrtTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::rsqrt(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename RsqrtOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class rsqrt_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    rsqrt_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, RsqrtOutputType, RsqrtStridedFunctor, rsqrt_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct RsqrtStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!RsqrtOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = rsqrt_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::rsqrt
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
new file mode 100644
index 000000000000..ceb3d1320f9c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
@@ -0,0 +1,258 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of SIGN(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "cabs_impl.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::sign
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+using dpctl::tensor::type_utils::vec_cast;
+
+template <typename argT, typename resT>
+struct SignFunctor
+{
+    static_assert(std::is_same_v<resT, argT>);
+    using is_constant = typename std::false_type;
+    // constexpr resT constant_value = resT{};
+    using supports_vec = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+    using supports_sg_loadstore = std::false_type;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (std::is_integral_v<argT>) {
+            if constexpr (std::is_unsigned_v<argT>) {
+                return resT(0 < in);
+            }
+            else {
+                return sign_impl<argT>(in);
+            }
+        }
+        else {
+            if constexpr (is_complex<argT>::value) {
+                using realT = typename argT::value_type;
+
+                if (in == argT(0)) {
+                    return resT(0);
+                }
+                else {
+                    auto z = exprm_ns::complex<realT>(in);
+                    return (z / detail::cabs(in));
+                }
+            }
+            else {
+                if (std::isnan(in)) {
+                    return std::numeric_limits<resT>::quiet_NaN();
+                }
+                else {
+                    return sign_impl<argT>(in);
+                }
+            }
+        }
+    }
+
+private:
+    template <typename T>
+    T sign_impl(const T &v) const
+    {
+        return (T(0) < v) - (v < T(0));
+    }
+};
+
+template <typename argT,
+          typename resT = argT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using SignContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           SignFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename T>
+struct SignOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, std::uint8_t>,
+        td_ns::TypeMapResultEntry<T, std::uint16_t>,
+        td_ns::TypeMapResultEntry<T, std::uint32_t>,
+        td_ns::TypeMapResultEntry<T, std::uint64_t>,
+        td_ns::TypeMapResultEntry<T, std::int8_t>,
+        td_ns::TypeMapResultEntry<T, std::int16_t>,
+        td_ns::TypeMapResultEntry<T, std::int32_t>,
+        td_ns::TypeMapResultEntry<T, std::int64_t>,
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct SignContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class sign_contig_kernel;
+
+template <typename argTy>
+sycl::event sign_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using SignHS = hyperparam_detail::SignContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = SignHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = SignHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, SignOutputType, SignContigFunctor, sign_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct SignContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!SignOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = sign_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct SignTypeMapFactory
+{
+    /*! @brief get typeid for output type of sign(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename SignOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename argTy, typename resTy, typename IndexerT>
+using SignStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, SignFunctor<argTy, resTy>>;
+
+template <typename T1, typename T2, typename T3>
+class sign_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    sign_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, SignOutputType, SignStridedFunctor, sign_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct SignStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SignOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = sign_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::sign
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
new file mode 100644
index 000000000000..d67120633efd
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
@@ -0,0 +1,223 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of SIGNBIT(x)
+/// function that tests whether the sign bit of the tensor element is set.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::signbit
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+using dpctl::tensor::type_utils::vec_cast;
+
+template <typename argT, typename resT>
+struct SignbitFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using is_constant = std::false_type;
+    static constexpr resT constant_value = false;
+    using supports_vec = std::true_type;
+    using supports_sg_loadstore = std::true_type;
+
+    resT operator()(const argT &in) const
+    {
+        return std::signbit(in);
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
+    {
+        auto const &res_vec = sycl::signbit(in);
+
+        using deducedT = typename std::remove_cv_t<
+            std::remove_reference_t<decltype(res_vec)>>::element_type;
+
+        return vec_cast<resT, deducedT, vec_sz>(res_vec);
+    }
+};
+
+template <typename argT,
+          typename resT = bool,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using SignbitContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           SignbitFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using SignbitStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, SignbitFunctor<argTy, resTy>>;
+
+template <typename argTy>
+struct SignbitOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<argTy, sycl::half, bool>,
+        td_ns::TypeMapResultEntry<argTy, float, bool>,
+        td_ns::TypeMapResultEntry<argTy, double, bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct SignbitContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class signbit_contig_kernel;
+
+template <typename argTy>
+sycl::event signbit_contig_impl(sycl::queue &exec_q,
+                                std::size_t nelems,
+                                const char *arg_p,
+                                char *res_p,
+                                const std::vector<sycl::event> &depends = {})
+{
+    using SignbitHS = hyperparam_detail::SignbitContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = SignbitHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = SignbitHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, SignbitOutputType, SignbitContigFunctor, signbit_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct SignbitContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!SignbitOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = signbit_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct SignbitTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::isinf(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename SignbitOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class signbit_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    signbit_strided_impl(sycl::queue &exec_q,
+                         std::size_t nelems,
+                         int nd,
+                         const ssize_t *shape_and_strides,
+                         const char *arg_p,
+                         ssize_t arg_offset,
+                         char *res_p,
+                         ssize_t res_offset,
+                         const std::vector<sycl::event> &depends,
+                         const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<argTy, SignbitOutputType,
+                                                  SignbitStridedFunctor,
+                                                  signbit_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct SignbitStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SignbitOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = signbit_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::signbit
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
new file mode 100644
index 000000000000..d1e3caa9effe
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
@@ -0,0 +1,333 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of SIN(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::sin
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct SinFunctor
+{
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+
+            realT const &in_re = std::real(in);
+            realT const &in_im = std::imag(in);
+
+            const bool in_re_finite = std::isfinite(in_re);
+            const bool in_im_finite = std::isfinite(in_im);
+            /*
+             * Handle the nearly-non-exceptional cases where
+             * real and imaginary parts of input are finite.
+             */
+            if (in_re_finite && in_im_finite) {
+                resT res =
+                    exprm_ns::sin(exprm_ns::complex<realT>(in)); // sin(in);
+                if (in_re == realT(0)) {
+                    res.real(sycl::copysign(realT(0), in_re));
+                }
+                return res;
+            }
+
+            /*
+             * since sin(in) = -I * sinh(I * in), for special cases,
+             * we calculate real and imaginary parts of z = sinh(I * in) and
+             * then return { imag(z) , -real(z) } which is sin(in).
+             */
+            const realT x = -in_im;
+            const realT y = in_re;
+            const bool xfinite = in_im_finite;
+            const bool yfinite = in_re_finite;
+            /*
+             * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
+             * The sign of 0 in the result is unspecified.  Choice = normally
+             * the same as dNaN.
+             *
+             * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
+             * The sign of 0 in the result is unspecified.  Choice = normally
+             * the same as d(NaN).
+             */
+            if (x == realT(0) && !yfinite) {
+                const realT sinh_im = q_nan;
+                const realT sinh_re = sycl::copysign(realT(0), x * sinh_im);
+                return resT{sinh_im, -sinh_re};
+            }
+
+            /*
+             * sinh(+-Inf +- I 0) = +-Inf + I +-0.
+             *
+             * sinh(NaN +- I 0)   = d(NaN) + I +-0.
+             */
+            if (y == realT(0) && !xfinite) {
+                if (std::isnan(x)) {
+                    const realT sinh_re = x;
+                    const realT sinh_im = y;
+                    return resT{sinh_im, -sinh_re};
+                }
+                const realT sinh_re = x;
+                const realT sinh_im = sycl::copysign(realT(0), y);
+                return resT{sinh_im, -sinh_re};
+            }
+
+            /*
+             * sinh(x +- I Inf) = dNaN + I dNaN.
+             *
+             * sinh(x + I NaN) = d(NaN) + I d(NaN).
+             */
+            if (xfinite && !yfinite) {
+                const realT sinh_re = q_nan;
+                const realT sinh_im = x * sinh_re;
+                return resT{sinh_im, -sinh_re};
+            }
+
+            /*
+             * sinh(+-Inf + I NaN)  = +-Inf + I d(NaN).
+             * The sign of Inf in the result is unspecified.  Choice = normally
+             * the same as d(NaN).
+             *
+             * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
+             * The sign of Inf in the result is unspecified.
+             * Choice = always - here for sinh to have positive result for
+             * imaginary part of sin.
+             *
+             * sinh(+-Inf + I y)   = +-Inf cos(y) + I Inf sin(y)
+             */
+            if (std::isinf(x)) {
+                if (!yfinite) {
+                    const realT sinh_re = -x * x;
+                    const realT sinh_im = x * (y - y);
+                    return resT{sinh_im, -sinh_re};
+                }
+                const realT sinh_re = x * sycl::cos(y);
+                const realT sinh_im =
+                    std::numeric_limits<realT>::infinity() * sycl::sin(y);
+                return resT{sinh_im, -sinh_re};
+            }
+
+            /*
+             * sinh(NaN + I NaN)  = d(NaN) + I d(NaN).
+             *
+             * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
+             *
+             * sinh(NaN + I y)    = d(NaN) + I d(NaN).
+             */
+            const realT y_m_y = (y - y);
+            const realT sinh_re = (x * x) * y_m_y;
+            const realT sinh_im = (x + x) * y_m_y;
+            return resT{sinh_im, -sinh_re};
+        }
+        else {
+            static_assert(std::is_same_v<argT, resT>);
+            if (in == 0) {
+                return in;
+            }
+            return sycl::sin(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using SinContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           SinFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using SinStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, SinFunctor<argTy, resTy>>;
+
+template <typename T>
+struct SinOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct SinContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class sin_contig_kernel;
+
+template <typename argTy>
+sycl::event sin_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg_p,
+                            char *res_p,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using SinHS = hyperparam_detail::SinContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = SinHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = SinHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, SinOutputType, SinContigFunctor, sin_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct SinContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!SinOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = sin_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct SinTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::sin(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename SinOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class sin_strided_kernel;
+
+template <typename argTy>
+sycl::event sin_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             ssize_t arg_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, SinOutputType, SinStridedFunctor, sin_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct SinStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SinOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = sin_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::sin
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
new file mode 100644
index 000000000000..f81a2730fd17
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
@@ -0,0 +1,302 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of SINH(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::sinh
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct SinhFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            const realT x = std::real(in);
+            const realT y = std::imag(in);
+
+            const bool xfinite = std::isfinite(x);
+            const bool yfinite = std::isfinite(y);
+
+            /*
+             * Handle the nearly-non-exceptional cases where
+             * real and imaginary parts of input are finite.
+             */
+            if (xfinite && yfinite) {
+                return exprm_ns::sinh(exprm_ns::complex<realT>(in));
+            }
+            /*
+             * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
+             * The sign of 0 in the result is unspecified.  Choice = normally
+             * the same as dNaN.
+             *
+             * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
+             * The sign of 0 in the result is unspecified.  Choice = normally
+             * the same as d(NaN).
+             */
+            if (x == realT(0) && !yfinite) {
+                const realT res_re = sycl::copysign(realT(0), x * (y - y));
+                return resT{res_re, y - y};
+            }
+
+            /*
+             * sinh(+-Inf +- I 0) = +-Inf + I +-0.
+             *
+             * sinh(NaN +- I 0)   = d(NaN) + I +-0.
+             */
+            if (y == realT(0) && !xfinite) {
+                if (std::isnan(x)) {
+                    return resT{x, y};
+                }
+                const realT res_im = sycl::copysign(realT(0), y);
+                return resT{x, res_im};
+            }
+
+            /*
+             * sinh(x +- I Inf) = dNaN + I dNaN.
+             *
+             * sinh(x + I NaN) = d(NaN) + I d(NaN).
+             */
+            if (xfinite && !yfinite) {
+                return resT{y - y, x * (y - y)};
+            }
+
+            /*
+             * sinh(+-Inf + I NaN)  = +-Inf + I d(NaN).
+             * The sign of Inf in the result is unspecified.  Choice = normally
+             * the same as d(NaN).
+             *
+             * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
+             * The sign of Inf in the result is unspecified.  Choice = always +.
+             *
+             * sinh(+-Inf + I y)   = +-Inf cos(y) + I Inf sin(y)
+             */
+            if (!xfinite && !std::isnan(x)) {
+                if (!yfinite) {
+                    return resT{x * x, x * (y - y)};
+                }
+                return resT{x * sycl::cos(y),
+                            std::numeric_limits<realT>::infinity() *
+                                sycl::sin(y)};
+            }
+
+            /*
+             * sinh(NaN + I NaN)  = d(NaN) + I d(NaN).
+             *
+             * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
+             *
+             * sinh(NaN + I y)    = d(NaN) + I d(NaN).
+             */
+            return resT{(x * x) * (y - y), (x + x) * (y - y)};
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            return sycl::sinh(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using SinhContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           SinhFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using SinhStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, SinhFunctor<argTy, resTy>>;
+
+template <typename T>
+struct SinhOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct SinhContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class sinh_contig_kernel;
+
+template <typename argTy>
+sycl::event sinh_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using SinhHS = hyperparam_detail::SinhContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = SinhHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = SinhHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, SinhOutputType, SinhContigFunctor, sinh_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct SinhContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!SinhOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = sinh_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct SinhTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::sinh(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename SinhOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class sinh_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    sinh_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, SinhOutputType, SinhStridedFunctor, sinh_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct SinhStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SinhOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = sinh_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::sinh
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
new file mode 100644
index 000000000000..08b3b092d1ca
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
@@ -0,0 +1,224 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of SQRT(x)
+/// function that computes a square root.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::sqrt
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct SqrtFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+            return exprm_ns::sqrt(exprm_ns::complex<realT>(in));
+        }
+        else {
+            return sycl::sqrt(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using SqrtContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           SqrtFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using SqrtStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, SqrtFunctor<argTy, resTy>>;
+
+template <typename T>
+struct SqrtOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
+        td_ns::
+            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct SqrtContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class sqrt_contig_kernel;
+
+template <typename argTy>
+sycl::event sqrt_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using SqrtHS = hyperparam_detail::SqrtContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = SqrtHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = SqrtHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct SqrtContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!SqrtOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = sqrt_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct SqrtTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::sqrt(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename SqrtOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class sqrt_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    sqrt_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, SqrtOutputType, SqrtStridedFunctor, sqrt_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct SqrtStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SqrtOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = sqrt_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::sqrt
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
new file mode 100644
index 000000000000..de3007acfbea
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
@@ -0,0 +1,251 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of SQUARE(x)
+///
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::square
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+using dpctl::tensor::type_utils::vec_cast;
+
+template <typename argT, typename resT>
+struct SquareFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            auto z = exprm_ns::complex<realT>(in);
+
+            return z * z;
+        }
+        else {
+            return in * in;
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
+    {
+        auto const &res_vec = in * in;
+        using deducedT = typename std::remove_cv_t<
+            std::remove_reference_t<decltype(res_vec)>>::element_type;
+        if constexpr (std::is_same_v<resT, deducedT>) {
+            return res_vec;
+        }
+        else {
+            return vec_cast<resT, deducedT, vec_sz>(res_vec);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using SquareContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           SquareFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using SquareStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, SquareFunctor<argTy, resTy>>;
+
+template <typename T>
+struct SquareOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, bool, std::int8_t>,
+        td_ns::TypeMapResultEntry<T, std::uint8_t>,
+        td_ns::TypeMapResultEntry<T, std::uint16_t>,
+        td_ns::TypeMapResultEntry<T, std::uint32_t>,
+        td_ns::TypeMapResultEntry<T, std::uint64_t>,
+        td_ns::TypeMapResultEntry<T, std::int8_t>,
+        td_ns::TypeMapResultEntry<T, std::int16_t>,
+        td_ns::TypeMapResultEntry<T, std::int32_t>,
+        td_ns::TypeMapResultEntry<T, std::int64_t>,
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct SquareContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class square_contig_kernel;
+
+template <typename argTy>
+sycl::event square_contig_impl(sycl::queue &exec_q,
+                               std::size_t nelems,
+                               const char *arg_p,
+                               char *res_p,
+                               const std::vector<sycl::event> &depends = {})
+{
+    using SquareHS = hyperparam_detail::SquareContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = SquareHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = SquareHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, SquareOutputType, SquareContigFunctor, square_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct SquareContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!SquareOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = square_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct SquareTypeMapFactory
+{
+    /*! @brief get typeid for output type of x * x */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename SquareOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class square_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    square_strided_impl(sycl::queue &exec_q,
+                        std::size_t nelems,
+                        int nd,
+                        const ssize_t *shape_and_strides,
+                        const char *arg_p,
+                        ssize_t arg_offset,
+                        char *res_p,
+                        ssize_t res_offset,
+                        const std::vector<sycl::event> &depends,
+                        const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, SquareOutputType, SquareStridedFunctor, square_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct SquareStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SquareOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = square_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::square
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
new file mode 100644
index 000000000000..2db2a6b5fbf8
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
@@ -0,0 +1,276 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of TAN(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::tan
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct TanFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+
+            using realT = typename argT::value_type;
+
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+            /*
+             * since tan(in) = -I * tanh(I * in), for special cases,
+             * we calculate real and imaginary parts of z = tanh(I * in) and
+             * return { imag(z) , -real(z) } which is tan(in).
+             */
+            const realT x = -std::imag(in);
+            const realT y = std::real(in);
+            /*
+             * tanh(NaN + i 0) = NaN + i 0
+             *
+             * tanh(NaN + i y) = NaN + i NaN        for y != 0
+             *
+             * The imaginary part has the sign of x*sin(2*y), but there's no
+             * special effort to get this right.
+             *
+             * tanh(+-Inf +- i Inf) = +-1 +- 0
+             *
+             * tanh(+-Inf + i y) = +-1 + 0 sin(2y)        for y finite
+             *
+             * The imaginary part of the sign is unspecified.  This special
+             * case is only needed to avoid a spurious invalid exception when
+             * y is infinite.
+             */
+            if (!std::isfinite(x)) {
+                if (std::isnan(x)) {
+                    const realT tanh_re = x;
+                    const realT tanh_im = (y == realT(0) ? y : x * y);
+                    return resT{tanh_im, -tanh_re};
+                }
+                const realT tanh_re = sycl::copysign(realT(1), x);
+                const realT tanh_im = sycl::copysign(
+                    realT(0), std::isinf(y) ? y : sycl::sin(y) * sycl::cos(y));
+                return resT{tanh_im, -tanh_re};
+            }
+            /*
+             * tanh(x + i NAN) = NaN + i NaN for non-zero x
+             * tanh(x +- i Inf) = NaN + i NaN for non-zero x
+             * tanh(0 + i NAN) = 0 + i NaN
+             * tanh(0 +- i Inf) = 0 + i NaN
+             */
+            if (!std::isfinite(y)) {
+                if (x == realT(0)) {
+                    return resT{q_nan, x};
+                }
+                return resT{q_nan, q_nan};
+            }
+            /* ordinary cases */
+            return exprm_ns::tan(exprm_ns::complex<realT>(in)); // tan(in);
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            return sycl::tan(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using TanContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           TanFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using TanStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, TanFunctor<argTy, resTy>>;
+
+template <typename T>
+struct TanOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct TanContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class tan_contig_kernel;
+
+template <typename argTy>
+sycl::event tan_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg_p,
+                            char *res_p,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using TanHS = hyperparam_detail::TanContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = TanHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = TanHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, TanOutputType, TanContigFunctor, tan_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct TanContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!TanOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = tan_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct TanTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::tan(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename TanOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class tan_strided_kernel;
+
+template <typename argTy>
+sycl::event tan_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             ssize_t arg_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, TanOutputType, TanStridedFunctor, tan_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct TanStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!TanOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = tan_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::tan
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
new file mode 100644
index 000000000000..dde16128fb1a
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
@@ -0,0 +1,270 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of TANH(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::tanh
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct TanhFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            static constexpr realT q_nan =
+                std::numeric_limits<realT>::quiet_NaN();
+
+            const realT x = std::real(in);
+            const realT y = std::imag(in);
+            /*
+             * tanh(NaN + i 0) = NaN + i 0
+             *
+             * tanh(NaN + i y) = NaN + i NaN        for y != 0
+             *
+             * The imaginary part has the sign of x*sin(2*y), but there's no
+             * special effort to get this right.
+             *
+             * tanh(+-Inf +- i Inf) = +-1 +- 0
+             *
+             * tanh(+-Inf + i y) = +-1 + 0 sin(2y)        for y finite
+             *
+             * The imaginary part of the sign is unspecified.  This special
+             * case is only needed to avoid a spurious invalid exception when
+             * y is infinite.
+             */
+            if (!std::isfinite(x)) {
+                if (std::isnan(x)) {
+                    return resT{q_nan, (y == realT(0) ? y : q_nan)};
+                }
+                const realT res_re = sycl::copysign(realT(1), x);
+                const realT res_im = sycl::copysign(
+                    realT(0), std::isinf(y) ? y : sycl::sin(y) * sycl::cos(y));
+                return resT{res_re, res_im};
+            }
+            /*
+             * tanh(x + i NAN) = NaN + i NaN for non-zero x
+             * tanh(x +- i Inf) = NaN + i NaN for non-zero x
+             * tanh(0 + i NAN) = 0 + i NaN
+             * tanh(0 +- i Inf) = 0 + i NaN
+             */
+            if (!std::isfinite(y)) {
+                if (x == realT(0)) {
+                    return resT{x, q_nan};
+                }
+                return resT{q_nan, q_nan};
+            }
+            /* ordinary cases */
+            return exprm_ns::tanh(exprm_ns::complex<realT>(in)); // tanh(in);
+        }
+        else {
+            static_assert(std::is_floating_point_v<argT> ||
+                          std::is_same_v<argT, sycl::half>);
+            return sycl::tanh(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using TanhContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           TanhFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using TanhStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, TanhFunctor<argTy, resTy>>;
+
+template <typename T>
+struct TanhOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct TanhContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class tanh_contig_kernel;
+
+template <typename argTy>
+sycl::event tanh_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using TanhHS = hyperparam_detail::TanhContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = TanhHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = TanhHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, TanhOutputType, TanhContigFunctor, tanh_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct TanhContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!TanhOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = tanh_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct TanhTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::tanh(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename TanhOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class tanh_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    tanh_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg_p,
+                      ssize_t arg_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, TanhOutputType, TanhStridedFunctor, tanh_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct TanhStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!TanhOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = tanh_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::tanh
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
new file mode 100644
index 000000000000..6fae9c4f27e5
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
@@ -0,0 +1,226 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of TRUNC(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::trunc
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT>
+struct TruncFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (std::is_integral_v<argT>) {
+            return in;
+        }
+        else {
+            return sycl::trunc(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using TruncContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           TruncFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using TruncStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, TruncFunctor<argTy, resTy>>;
+
+template <typename T>
+struct TruncOutputType
+{
+    using value_type =
+        typename std::disjunction<td_ns::TypeMapResultEntry<T, bool>,
+                                  td_ns::TypeMapResultEntry<T, std::uint8_t>,
+                                  td_ns::TypeMapResultEntry<T, std::uint16_t>,
+                                  td_ns::TypeMapResultEntry<T, std::uint32_t>,
+                                  td_ns::TypeMapResultEntry<T, std::uint64_t>,
+                                  td_ns::TypeMapResultEntry<T, std::int8_t>,
+                                  td_ns::TypeMapResultEntry<T, std::int16_t>,
+                                  td_ns::TypeMapResultEntry<T, std::int32_t>,
+                                  td_ns::TypeMapResultEntry<T, std::int64_t>,
+                                  td_ns::TypeMapResultEntry<T, sycl::half>,
+                                  td_ns::TypeMapResultEntry<T, float>,
+                                  td_ns::TypeMapResultEntry<T, double>,
+                                  td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy>
+struct TruncContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class trunc_contig_kernel;
+
+template <typename argTy>
+sycl::event trunc_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using TruncHS = hyperparam_detail::TruncContigHyperparameterSet<argTy>;
+    static constexpr std::uint8_t vec_sz = TruncHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = TruncHS::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, TruncOutputType, TruncContigFunctor, trunc_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T>
+struct TruncContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!TruncOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = trunc_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct TruncTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::trunc(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename TruncOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class trunc_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    trunc_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, TruncOutputType, TruncStridedFunctor, trunc_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct TruncStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!TruncOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = trunc_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::trunc
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/cbrt.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/cbrt.cpp
new file mode 100644
index 000000000000..a061235acfd7
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/cbrt.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "cbrt.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/cbrt.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U37: ==== CBRT   (x)
+namespace impl
+{
+
+namespace cbrt_fn_ns = dpctl::tensor::kernels::cbrt;
+
+static unary_contig_impl_fn_ptr_t cbrt_contig_dispatch_vector[td_ns::num_types];
+static int cbrt_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    cbrt_strided_dispatch_vector[td_ns::num_types];
+
+void populate_cbrt_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = cbrt_fn_ns;
+
+    using fn_ns::CbrtContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CbrtContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(cbrt_contig_dispatch_vector);
+
+    using fn_ns::CbrtStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CbrtStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(cbrt_strided_dispatch_vector);
+
+    using fn_ns::CbrtTypeMapFactory;
+    DispatchVectorBuilder<int, CbrtTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(cbrt_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_cbrt(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_cbrt_dispatch_vectors();
+        using impl::cbrt_contig_dispatch_vector;
+        using impl::cbrt_output_typeid_vector;
+        using impl::cbrt_strided_dispatch_vector;
+
+        auto cbrt_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, cbrt_output_typeid_vector,
+                cbrt_contig_dispatch_vector, cbrt_strided_dispatch_vector);
+        };
+        m.def("_cbrt", cbrt_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto cbrt_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, cbrt_output_typeid_vector);
+        };
+        m.def("_cbrt_result_type", cbrt_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/cbrt.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/cbrt.hpp
new file mode 100644
index 000000000000..53757bff7134
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/cbrt.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cbrt(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
index 0a0c02f7ed31..144e39be252f 100644
--- a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
@@ -51,7 +51,7 @@
 // #include "bitwise_or.hpp"
 // #include "bitwise_right_shift.hpp"
 // #include "bitwise_xor.hpp"
-// #include "cbrt.hpp"
+#include "cbrt.hpp"
 #include "ceil.hpp"
 #include "conj.hpp"
 // #include "copysign.hpp"
@@ -59,7 +59,7 @@
 #include "cosh.hpp"
 // #include "equal.hpp"
 #include "exp.hpp"
-// #include "exp2.hpp"
+#include "exp2.hpp"
 #include "expm1.hpp"
 #include "floor.hpp"
 // #include "floor_divide.hpp"
@@ -89,23 +89,23 @@
 // #include "not_equal.hpp"
 #include "positive.hpp"
 // #include "pow.hpp"
-// #include "proj.hpp"
-// #include "real.hpp"
-// #include "reciprocal.hpp"
+#include "proj.hpp"
+#include "real.hpp"
+#include "reciprocal.hpp"
 // #include "remainder.hpp"
-// #include "round.hpp"
-// #include "rsqrt.hpp"
-// #include "sign.hpp"
-// #include "signbit.hpp"
-// #include "sin.hpp"
-// #include "sinh.hpp"
-// #include "sqrt.hpp"
-// #include "square.hpp"
+#include "round.hpp"
+#include "rsqrt.hpp"
+#include "sign.hpp"
+#include "signbit.hpp"
+#include "sin.hpp"
+#include "sinh.hpp"
+#include "sqrt.hpp"
+#include "square.hpp"
 // #include "subtract.hpp"
-// #include "tan.hpp"
-// #include "tanh.hpp"
+#include "tan.hpp"
+#include "tanh.hpp"
 // #include "true_divide.hpp"
-// #include "trunc.hpp"
+#include "trunc.hpp"
 
 namespace dpctl::tensor::py_internal
 {
@@ -131,7 +131,7 @@ void init_elementwise_functions(py::module_ m)
     // init_bitwise_or(m);
     // init_bitwise_right_shift(m);
     // init_bitwise_xor(m);
-    // init_cbrt(m);
+    init_cbrt(m);
     init_ceil(m);
     init_conj(m);
     // init_copysign(m);
@@ -140,7 +140,7 @@ void init_elementwise_functions(py::module_ m)
     // init_divide(m);
     // init_equal(m);
     init_exp(m);
-    // init_exp2(m);
+    init_exp2(m);
     init_expm1(m);
     init_floor(m);
     // init_floor_divide(m);
@@ -170,22 +170,22 @@ void init_elementwise_functions(py::module_ m)
     // init_not_equal(m);
     init_positive(m);
     // init_pow(m);
-    // init_proj(m);
-    // init_real(m);
-    // init_reciprocal(m);
+    init_proj(m);
+    init_real(m);
+    init_reciprocal(m);
     // init_remainder(m);
-    // init_round(m);
-    // init_rsqrt(m);
-    // init_sign(m);
-    // init_signbit(m);
-    // init_sin(m);
-    // init_sinh(m);
-    // init_sqrt(m);
-    // init_square(m);
+    init_round(m);
+    init_rsqrt(m);
+    init_sign(m);
+    init_signbit(m);
+    init_sin(m);
+    init_sinh(m);
+    init_sqrt(m);
+    init_square(m);
     // init_subtract(m);
-    // init_tan(m);
-    // init_tanh(m);
-    // init_trunc(m);
+    init_tan(m);
+    init_tanh(m);
+    init_trunc(m);
 }
 
 } // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp2.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp2.cpp
new file mode 100644
index 000000000000..fc40a8e0aab9
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp2.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "exp2.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/exp2.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U38: ==== EXP2   (x)
+namespace impl
+{
+
+namespace exp2_fn_ns = dpctl::tensor::kernels::exp2;
+
+static unary_contig_impl_fn_ptr_t exp2_contig_dispatch_vector[td_ns::num_types];
+static int exp2_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    exp2_strided_dispatch_vector[td_ns::num_types];
+
+void populate_exp2_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = exp2_fn_ns;
+
+    using fn_ns::Exp2ContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Exp2ContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(exp2_contig_dispatch_vector);
+
+    using fn_ns::Exp2StridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Exp2StridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(exp2_strided_dispatch_vector);
+
+    using fn_ns::Exp2TypeMapFactory;
+    DispatchVectorBuilder<int, Exp2TypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(exp2_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_exp2(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_exp2_dispatch_vectors();
+        using impl::exp2_contig_dispatch_vector;
+        using impl::exp2_output_typeid_vector;
+        using impl::exp2_strided_dispatch_vector;
+
+        auto exp2_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, exp2_output_typeid_vector,
+                exp2_contig_dispatch_vector, exp2_strided_dispatch_vector);
+        };
+        m.def("_exp2", exp2_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto exp2_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, exp2_output_typeid_vector);
+        };
+        m.def("_exp2_result_type", exp2_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp2.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp2.hpp
new file mode 100644
index 000000000000..f9f315d14383
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp2.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_exp2(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/proj.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/proj.cpp
new file mode 100644
index 000000000000..9583de8bd195
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/proj.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "proj.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/proj.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U40: ==== PROJ   (x)
+namespace impl
+{
+
+namespace proj_fn_ns = dpctl::tensor::kernels::proj;
+
+static unary_contig_impl_fn_ptr_t proj_contig_dispatch_vector[td_ns::num_types];
+static int proj_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    proj_strided_dispatch_vector[td_ns::num_types];
+
+void populate_proj_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = proj_fn_ns;
+
+    using fn_ns::ProjContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ProjContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(proj_contig_dispatch_vector);
+
+    using fn_ns::ProjStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ProjStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(proj_strided_dispatch_vector);
+
+    using fn_ns::ProjTypeMapFactory;
+    DispatchVectorBuilder<int, ProjTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(proj_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_proj(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_proj_dispatch_vectors();
+        using impl::proj_contig_dispatch_vector;
+        using impl::proj_output_typeid_vector;
+        using impl::proj_strided_dispatch_vector;
+
+        auto proj_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, proj_output_typeid_vector,
+                proj_contig_dispatch_vector, proj_strided_dispatch_vector);
+        };
+        m.def("_proj", proj_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto proj_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, proj_output_typeid_vector);
+        };
+        m.def("_proj_result_type", proj_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/proj.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/proj.hpp
new file mode 100644
index 000000000000..3cdc0e8271b0
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/proj.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_proj(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/real.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/real.cpp
new file mode 100644
index 000000000000..6ed3f5fdc404
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/real.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "real.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/real.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U27: ==== REAL   (x)
+namespace impl
+{
+
+namespace real_fn_ns = dpctl::tensor::kernels::real;
+
+static unary_contig_impl_fn_ptr_t real_contig_dispatch_vector[td_ns::num_types];
+static int real_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    real_strided_dispatch_vector[td_ns::num_types];
+
+void populate_real_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = real_fn_ns;
+
+    using fn_ns::RealContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RealContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(real_contig_dispatch_vector);
+
+    using fn_ns::RealStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RealStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(real_strided_dispatch_vector);
+
+    using fn_ns::RealTypeMapFactory;
+    DispatchVectorBuilder<int, RealTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(real_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_real(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_real_dispatch_vectors();
+        using impl::real_contig_dispatch_vector;
+        using impl::real_output_typeid_vector;
+        using impl::real_strided_dispatch_vector;
+
+        auto real_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, real_output_typeid_vector,
+                real_contig_dispatch_vector, real_strided_dispatch_vector);
+        };
+        m.def("_real", real_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto real_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, real_output_typeid_vector);
+        };
+        m.def("_real_result_type", real_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/real.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/real.hpp
new file mode 100644
index 000000000000..81f4743e823b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/real.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_real(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/reciprocal.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/reciprocal.cpp
new file mode 100644
index 000000000000..cdb0f43dfbe0
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/reciprocal.cpp
@@ -0,0 +1,129 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "reciprocal.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/reciprocal.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U42: ==== REAL   (x)
+namespace impl
+{
+
+namespace reciprocal_fn_ns = dpctl::tensor::kernels::reciprocal;
+
+static unary_contig_impl_fn_ptr_t
+    reciprocal_contig_dispatch_vector[td_ns::num_types];
+static int reciprocal_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    reciprocal_strided_dispatch_vector[td_ns::num_types];
+
+void populate_reciprocal_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = reciprocal_fn_ns;
+
+    using fn_ns::ReciprocalContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ReciprocalContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(reciprocal_contig_dispatch_vector);
+
+    using fn_ns::ReciprocalStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ReciprocalStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(reciprocal_strided_dispatch_vector);
+
+    using fn_ns::ReciprocalTypeMapFactory;
+    DispatchVectorBuilder<int, ReciprocalTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(reciprocal_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_reciprocal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_reciprocal_dispatch_vectors();
+        using impl::reciprocal_contig_dispatch_vector;
+        using impl::reciprocal_output_typeid_vector;
+        using impl::reciprocal_strided_dispatch_vector;
+
+        auto reciprocal_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                    sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  reciprocal_output_typeid_vector,
+                                  reciprocal_contig_dispatch_vector,
+                                  reciprocal_strided_dispatch_vector);
+        };
+        m.def("_reciprocal", reciprocal_pyapi, "", py::arg("src"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        auto reciprocal_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              reciprocal_output_typeid_vector);
+        };
+        m.def("_reciprocal_result_type", reciprocal_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/reciprocal.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/reciprocal.hpp
new file mode 100644
index 000000000000..1d2156f3464e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/reciprocal.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_reciprocal(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/round.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/round.cpp
new file mode 100644
index 000000000000..d651b567c3c1
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/round.cpp
@@ -0,0 +1,126 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "round.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/round.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U28: ==== ROUND   (x)
+namespace impl
+{
+
+namespace round_fn_ns = dpctl::tensor::kernels::round;
+
+static unary_contig_impl_fn_ptr_t
+    round_contig_dispatch_vector[td_ns::num_types];
+static int round_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    round_strided_dispatch_vector[td_ns::num_types];
+
+void populate_round_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = round_fn_ns;
+
+    using fn_ns::RoundContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RoundContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(round_contig_dispatch_vector);
+
+    using fn_ns::RoundStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RoundStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(round_strided_dispatch_vector);
+
+    using fn_ns::RoundTypeMapFactory;
+    DispatchVectorBuilder<int, RoundTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(round_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_round(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_round_dispatch_vectors();
+        using impl::round_contig_dispatch_vector;
+        using impl::round_output_typeid_vector;
+        using impl::round_strided_dispatch_vector;
+
+        auto round_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, round_output_typeid_vector,
+                round_contig_dispatch_vector, round_strided_dispatch_vector);
+        };
+        m.def("_round", round_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto round_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              round_output_typeid_vector);
+        };
+        m.def("_round_result_type", round_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/round.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/round.hpp
new file mode 100644
index 000000000000..ca56e110eec5
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/round.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_round(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/rsqrt.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/rsqrt.cpp
new file mode 100644
index 000000000000..738bef333d75
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/rsqrt.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "rsqrt.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/rsqrt.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U39: ==== RSQRT   (x)
+namespace impl
+{
+
+namespace rsqrt_fn_ns = dpctl::tensor::kernels::rsqrt;
+
+static unary_contig_impl_fn_ptr_t
+    rsqrt_contig_dispatch_vector[td_ns::num_types];
+static int rsqrt_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    rsqrt_strided_dispatch_vector[td_ns::num_types];
+
+void populate_rsqrt_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = rsqrt_fn_ns;
+
+    using fn_ns::RsqrtContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RsqrtContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(rsqrt_contig_dispatch_vector);
+
+    using fn_ns::RsqrtStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RsqrtStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(rsqrt_strided_dispatch_vector);
+
+    using fn_ns::RsqrtTypeMapFactory;
+    DispatchVectorBuilder<int, RsqrtTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(rsqrt_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_rsqrt(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_rsqrt_dispatch_vectors();
+        using impl::rsqrt_contig_dispatch_vector;
+        using impl::rsqrt_output_typeid_vector;
+        using impl::rsqrt_strided_dispatch_vector;
+
+        auto rsqrt_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, rsqrt_output_typeid_vector,
+                rsqrt_contig_dispatch_vector, rsqrt_strided_dispatch_vector);
+        };
+        m.def("_rsqrt", rsqrt_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto rsqrt_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              rsqrt_output_typeid_vector);
+        };
+        m.def("_rsqrt_result_type", rsqrt_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/rsqrt.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/rsqrt.hpp
new file mode 100644
index 000000000000..4ba740a31777
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/rsqrt.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_rsqrt(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sign.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sign.cpp
new file mode 100644
index 000000000000..5051926e7470
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sign.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "sign.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/sign.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U29: ==== SIGN   (x)
+namespace impl
+{
+
+namespace sign_fn_ns = dpctl::tensor::kernels::sign;
+
+static unary_contig_impl_fn_ptr_t sign_contig_dispatch_vector[td_ns::num_types];
+static int sign_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    sign_strided_dispatch_vector[td_ns::num_types];
+
+void populate_sign_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = sign_fn_ns;
+
+    using fn_ns::SignContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SignContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(sign_contig_dispatch_vector);
+
+    using fn_ns::SignStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SignStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(sign_strided_dispatch_vector);
+
+    using fn_ns::SignTypeMapFactory;
+    DispatchVectorBuilder<int, SignTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(sign_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_sign(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_sign_dispatch_vectors();
+        using impl::sign_contig_dispatch_vector;
+        using impl::sign_output_typeid_vector;
+        using impl::sign_strided_dispatch_vector;
+
+        auto sign_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, sign_output_typeid_vector,
+                sign_contig_dispatch_vector, sign_strided_dispatch_vector);
+        };
+        m.def("_sign", sign_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto sign_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, sign_output_typeid_vector);
+        };
+        m.def("_sign_result_type", sign_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sign.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sign.hpp
new file mode 100644
index 000000000000..19686ada3dbf
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sign.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_sign(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/signbit.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/signbit.cpp
new file mode 100644
index 000000000000..eeef1de50331
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/signbit.cpp
@@ -0,0 +1,128 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "signbit.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/signbit.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U41: ==== SIGNBIT   (x)
+namespace impl
+{
+
+namespace signbit_fn_ns = dpctl::tensor::kernels::signbit;
+
+static unary_contig_impl_fn_ptr_t
+    signbit_contig_dispatch_vector[td_ns::num_types];
+static int signbit_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    signbit_strided_dispatch_vector[td_ns::num_types];
+
+void populate_signbit_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = signbit_fn_ns;
+
+    using fn_ns::SignbitContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SignbitContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(signbit_contig_dispatch_vector);
+
+    using fn_ns::SignbitStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SignbitStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(signbit_strided_dispatch_vector);
+
+    using fn_ns::SignbitTypeMapFactory;
+    DispatchVectorBuilder<int, SignbitTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(signbit_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_signbit(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_signbit_dispatch_vectors();
+        using impl::signbit_contig_dispatch_vector;
+        using impl::signbit_output_typeid_vector;
+        using impl::signbit_strided_dispatch_vector;
+
+        auto signbit_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                 sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  signbit_output_typeid_vector,
+                                  signbit_contig_dispatch_vector,
+                                  signbit_strided_dispatch_vector);
+        };
+        m.def("_signbit", signbit_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto signbit_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              signbit_output_typeid_vector);
+        };
+        m.def("_signbit_result_type", signbit_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/signbit.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/signbit.hpp
new file mode 100644
index 000000000000..292386b174fc
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/signbit.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_signbit(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sin.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sin.cpp
new file mode 100644
index 000000000000..7db753e27c4b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sin.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "sin.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/sin.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U30: ==== SIN   (x)
+namespace impl
+{
+
+namespace sin_fn_ns = dpctl::tensor::kernels::sin;
+
+static unary_contig_impl_fn_ptr_t sin_contig_dispatch_vector[td_ns::num_types];
+static int sin_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    sin_strided_dispatch_vector[td_ns::num_types];
+
+void populate_sin_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = sin_fn_ns;
+
+    using fn_ns::SinContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SinContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(sin_contig_dispatch_vector);
+
+    using fn_ns::SinStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SinStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(sin_strided_dispatch_vector);
+
+    using fn_ns::SinTypeMapFactory;
+    DispatchVectorBuilder<int, SinTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(sin_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_sin(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_sin_dispatch_vectors();
+        using impl::sin_contig_dispatch_vector;
+        using impl::sin_output_typeid_vector;
+        using impl::sin_strided_dispatch_vector;
+
+        auto sin_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, sin_output_typeid_vector,
+                sin_contig_dispatch_vector, sin_strided_dispatch_vector);
+        };
+        m.def("_sin", sin_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto sin_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, sin_output_typeid_vector);
+        };
+        m.def("_sin_result_type", sin_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sin.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sin.hpp
new file mode 100644
index 000000000000..a4b3da08b7fc
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sin.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_sin(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sinh.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sinh.cpp
new file mode 100644
index 000000000000..e56a28e0c2aa
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sinh.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "sinh.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/sinh.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U31: ==== SINH   (x)
+namespace impl
+{
+
+namespace sinh_fn_ns = dpctl::tensor::kernels::sinh;
+
+static unary_contig_impl_fn_ptr_t sinh_contig_dispatch_vector[td_ns::num_types];
+static int sinh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    sinh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_sinh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = sinh_fn_ns;
+
+    using fn_ns::SinhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SinhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(sinh_contig_dispatch_vector);
+
+    using fn_ns::SinhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SinhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(sinh_strided_dispatch_vector);
+
+    using fn_ns::SinhTypeMapFactory;
+    DispatchVectorBuilder<int, SinhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(sinh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_sinh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_sinh_dispatch_vectors();
+        using impl::sinh_contig_dispatch_vector;
+        using impl::sinh_output_typeid_vector;
+        using impl::sinh_strided_dispatch_vector;
+
+        auto sinh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, sinh_output_typeid_vector,
+                sinh_contig_dispatch_vector, sinh_strided_dispatch_vector);
+        };
+        m.def("_sinh", sinh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto sinh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, sinh_output_typeid_vector);
+        };
+        m.def("_sinh_result_type", sinh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sinh.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sinh.hpp
new file mode 100644
index 000000000000..4a0d90d24c8c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sinh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_sinh(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sqrt.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sqrt.cpp
new file mode 100644
index 000000000000..a4a715147055
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sqrt.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "sqrt.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/sqrt.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U33: ==== SQRT   (x)
+namespace impl
+{
+
+namespace sqrt_fn_ns = dpctl::tensor::kernels::sqrt;
+
+static unary_contig_impl_fn_ptr_t sqrt_contig_dispatch_vector[td_ns::num_types];
+static int sqrt_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    sqrt_strided_dispatch_vector[td_ns::num_types];
+
+void populate_sqrt_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = sqrt_fn_ns;
+
+    using fn_ns::SqrtContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SqrtContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(sqrt_contig_dispatch_vector);
+
+    using fn_ns::SqrtStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SqrtStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(sqrt_strided_dispatch_vector);
+
+    using fn_ns::SqrtTypeMapFactory;
+    DispatchVectorBuilder<int, SqrtTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(sqrt_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_sqrt(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_sqrt_dispatch_vectors();
+        using impl::sqrt_contig_dispatch_vector;
+        using impl::sqrt_output_typeid_vector;
+        using impl::sqrt_strided_dispatch_vector;
+
+        auto sqrt_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, sqrt_output_typeid_vector,
+                sqrt_contig_dispatch_vector, sqrt_strided_dispatch_vector);
+        };
+        m.def("_sqrt", sqrt_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto sqrt_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, sqrt_output_typeid_vector);
+        };
+        m.def("_sqrt_result_type", sqrt_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sqrt.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sqrt.hpp
new file mode 100644
index 000000000000..e8f7014c1afc
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/sqrt.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_sqrt(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/square.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/square.cpp
new file mode 100644
index 000000000000..d3e229ae42fc
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/square.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "square.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/square.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U32: ==== SQUARE   (x)
+namespace impl
+{
+
+namespace square_fn_ns = dpctl::tensor::kernels::square;
+
+static unary_contig_impl_fn_ptr_t
+    square_contig_dispatch_vector[td_ns::num_types];
+static int square_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    square_strided_dispatch_vector[td_ns::num_types];
+
+void populate_square_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = square_fn_ns;
+
+    using fn_ns::SquareContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SquareContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(square_contig_dispatch_vector);
+
+    using fn_ns::SquareStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SquareStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(square_strided_dispatch_vector);
+
+    using fn_ns::SquareTypeMapFactory;
+    DispatchVectorBuilder<int, SquareTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(square_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_square(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_square_dispatch_vectors();
+        using impl::square_contig_dispatch_vector;
+        using impl::square_output_typeid_vector;
+        using impl::square_strided_dispatch_vector;
+
+        auto square_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, square_output_typeid_vector,
+                square_contig_dispatch_vector, square_strided_dispatch_vector);
+        };
+        m.def("_square", square_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto square_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              square_output_typeid_vector);
+        };
+        m.def("_square_result_type", square_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/square.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/square.hpp
new file mode 100644
index 000000000000..3f23f184499c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/square.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_square(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/tan.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/tan.cpp
new file mode 100644
index 000000000000..8abdea0e5283
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/tan.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "tan.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/tan.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U34: ==== TAN   (x)
+namespace impl
+{
+
+namespace tan_fn_ns = dpctl::tensor::kernels::tan;
+
+static unary_contig_impl_fn_ptr_t tan_contig_dispatch_vector[td_ns::num_types];
+static int tan_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    tan_strided_dispatch_vector[td_ns::num_types];
+
+void populate_tan_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = tan_fn_ns;
+
+    using fn_ns::TanContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(tan_contig_dispatch_vector);
+
+    using fn_ns::TanStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(tan_strided_dispatch_vector);
+
+    using fn_ns::TanTypeMapFactory;
+    DispatchVectorBuilder<int, TanTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(tan_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_tan(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_tan_dispatch_vectors();
+        using impl::tan_contig_dispatch_vector;
+        using impl::tan_output_typeid_vector;
+        using impl::tan_strided_dispatch_vector;
+
+        auto tan_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, tan_output_typeid_vector,
+                tan_contig_dispatch_vector, tan_strided_dispatch_vector);
+        };
+        m.def("_tan", tan_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto tan_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, tan_output_typeid_vector);
+        };
+        m.def("_tan_result_type", tan_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/tan.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/tan.hpp
new file mode 100644
index 000000000000..b0818a9a85c2
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/tan.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_tan(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/tanh.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/tanh.cpp
new file mode 100644
index 000000000000..bf8ff205c0af
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/tanh.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "tanh.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/tanh.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U35: ==== TANH   (x)
+namespace impl
+{
+
+namespace tanh_fn_ns = dpctl::tensor::kernels::tanh;
+
+static unary_contig_impl_fn_ptr_t tanh_contig_dispatch_vector[td_ns::num_types];
+static int tanh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    tanh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_tanh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = tanh_fn_ns;
+
+    using fn_ns::TanhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(tanh_contig_dispatch_vector);
+
+    using fn_ns::TanhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(tanh_strided_dispatch_vector);
+
+    using fn_ns::TanhTypeMapFactory;
+    DispatchVectorBuilder<int, TanhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(tanh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_tanh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_tanh_dispatch_vectors();
+        using impl::tanh_contig_dispatch_vector;
+        using impl::tanh_output_typeid_vector;
+        using impl::tanh_strided_dispatch_vector;
+
+        auto tanh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, tanh_output_typeid_vector,
+                tanh_contig_dispatch_vector, tanh_strided_dispatch_vector);
+        };
+        m.def("_tanh", tanh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto tanh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, tanh_output_typeid_vector);
+        };
+        m.def("_tanh_result_type", tanh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/tanh.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/tanh.hpp
new file mode 100644
index 000000000000..d29c924d5e73
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/tanh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_tanh(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/trunc.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/trunc.cpp
new file mode 100644
index 000000000000..3a798d8e110d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/trunc.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "trunc.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/trunc.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U36: ==== TRUNC   (x)
+namespace impl
+{
+
+namespace trunc_fn_ns = dpctl::tensor::kernels::trunc;
+
+static unary_contig_impl_fn_ptr_t
+    trunc_contig_dispatch_vector[td_ns::num_types];
+static int trunc_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    trunc_strided_dispatch_vector[td_ns::num_types];
+
+void populate_trunc_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = trunc_fn_ns;
+
+    using fn_ns::TruncContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TruncContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(trunc_contig_dispatch_vector);
+
+    using fn_ns::TruncStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TruncStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(trunc_strided_dispatch_vector);
+
+    using fn_ns::TruncTypeMapFactory;
+    DispatchVectorBuilder<int, TruncTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(trunc_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_trunc(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_trunc_dispatch_vectors();
+        using impl::trunc_contig_dispatch_vector;
+        using impl::trunc_output_typeid_vector;
+        using impl::trunc_strided_dispatch_vector;
+
+        auto trunc_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, trunc_output_typeid_vector,
+                trunc_contig_dispatch_vector, trunc_strided_dispatch_vector);
+        };
+        m.def("_trunc", trunc_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto trunc_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              trunc_output_typeid_vector);
+        };
+        m.def("_trunc_result_type", trunc_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/trunc.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/trunc.hpp
new file mode 100644
index 000000000000..79ed6b5ded14
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/trunc.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_trunc(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index a6e0d0b39a98..c84b61dad4bf 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -4250,8 +4250,8 @@ def prod(
 
 proj = DPNPUnaryFunc(
     "proj",
-    ti._proj_result_type,
-    ti._proj,
+    ti_ext._proj_result_type,
+    ti_ext._proj,
     _PROJ_DOCSTRING,
 )
 
@@ -4313,8 +4313,8 @@ def prod(
 
 real = DPNPReal(
     "real",
-    ti._real_result_type,
-    ti._real,
+    ti_ext._real_result_type,
+    ti_ext._real,
     _REAL_DOCSTRING,
 )
 
@@ -4596,8 +4596,8 @@ def real_if_close(a, tol=100):
 
 round = DPNPRound(
     "round",
-    ti._round_result_type,
-    ti._round,
+    ti_ext._round_result_type,
+    ti_ext._round,
     _ROUND_DOCSTRING,
     mkl_fn_to_call="_mkl_round_to_call",
     mkl_impl_fn="_round",
@@ -4668,8 +4668,8 @@ def real_if_close(a, tol=100):
 
 sign = DPNPUnaryFunc(
     "sign",
-    ti._sign_result_type,
-    ti._sign,
+    ti_ext._sign_result_type,
+    ti_ext._sign,
     _SIGN_DOCSTRING,
     acceptance_fn=acceptance_fn_sign,
 )
@@ -4730,8 +4730,8 @@ def real_if_close(a, tol=100):
 
 signbit = DPNPUnaryFunc(
     "signbit",
-    ti._signbit_result_type,
-    ti._signbit,
+    ti_ext._signbit_result_type,
+    ti_ext._signbit,
     _SIGNBIT_DOCSTRING,
 )
 
@@ -5229,8 +5229,8 @@ def trapezoid(y, x=None, dx=1.0, axis=-1):
 
 trunc = DPNPUnaryFunc(
     "trunc",
-    ti._trunc_result_type,
-    ti._trunc,
+    ti_ext._trunc_result_type,
+    ti_ext._trunc,
     _TRUNC_DOCSTRING,
     mkl_fn_to_call="_mkl_trunc_to_call",
     mkl_impl_fn="_trunc",
diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py
index d459a3392311..6deab3a8876c 100644
--- a/dpnp/dpnp_iface_trigonometric.py
+++ b/dpnp/dpnp_iface_trigonometric.py
@@ -718,8 +718,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 cbrt = DPNPUnaryFunc(
     "cbrt",
-    ti._cbrt_result_type,
-    ti._cbrt,
+    ti_ext._cbrt_result_type,
+    ti_ext._cbrt,
     _CBRT_DOCSTRING,
     mkl_fn_to_call="_mkl_cbrt_to_call",
     mkl_impl_fn="_cbrt",
@@ -1187,8 +1187,8 @@ def cumlogsumexp(
 
 exp2 = DPNPUnaryFunc(
     "exp2",
-    ti._exp2_result_type,
-    ti._exp2,
+    ti_ext._exp2_result_type,
+    ti_ext._exp2,
     _EXP2_DOCSTRING,
     mkl_fn_to_call="_mkl_exp2_to_call",
     mkl_impl_fn="_exp2",
@@ -2107,8 +2107,8 @@ def logsumexp(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 reciprocal = DPNPUnaryFunc(
     "reciprocal",
-    ti._reciprocal_result_type,
-    ti._reciprocal,
+    ti_ext._reciprocal_result_type,
+    ti_ext._reciprocal,
     _RECIPROCAL_DOCSTRING,
     mkl_fn_to_call="_mkl_inv_to_call",
     mkl_impl_fn="_inv",
@@ -2252,8 +2252,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 rsqrt = DPNPUnaryFunc(
     "rsqrt",
-    ti._rsqrt_result_type,
-    ti._rsqrt,
+    ti_ext._rsqrt_result_type,
+    ti_ext._rsqrt,
     _RSQRT_DOCSTRING,
 )
 
@@ -2309,8 +2309,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 sin = DPNPUnaryFunc(
     "sin",
-    ti._sin_result_type,
-    ti._sin,
+    ti_ext._sin_result_type,
+    ti_ext._sin,
     _SIN_DOCSTRING,
     mkl_fn_to_call="_mkl_sin_to_call",
     mkl_impl_fn="_sin",
@@ -2372,8 +2372,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 sinh = DPNPUnaryFunc(
     "sinh",
-    ti._sinh_result_type,
-    ti._sinh,
+    ti_ext._sinh_result_type,
+    ti_ext._sinh,
     _SINH_DOCSTRING,
     mkl_fn_to_call="_mkl_sinh_to_call",
     mkl_impl_fn="_sinh",
@@ -2449,8 +2449,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 sqrt = DPNPUnaryFunc(
     "sqrt",
-    ti._sqrt_result_type,
-    ti._sqrt,
+    ti_ext._sqrt_result_type,
+    ti_ext._sqrt,
     _SQRT_DOCSTRING,
     mkl_fn_to_call="_mkl_sqrt_to_call",
     mkl_impl_fn="_sqrt",
@@ -2508,8 +2508,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 square = DPNPUnaryFunc(
     "square",
-    ti._square_result_type,
-    ti._square,
+    ti_ext._square_result_type,
+    ti_ext._square,
     _SQUARE_DOCSTRING,
     mkl_fn_to_call="_mkl_sqr_to_call",
     mkl_impl_fn="_sqr",
@@ -2567,8 +2567,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 tan = DPNPUnaryFunc(
     "tan",
-    ti._tan_result_type,
-    ti._tan,
+    ti_ext._tan_result_type,
+    ti_ext._tan,
     _TAN_DOCSTRING,
     mkl_fn_to_call="_mkl_tan_to_call",
     mkl_impl_fn="_tan",
@@ -2632,8 +2632,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 tanh = DPNPUnaryFunc(
     "tanh",
-    ti._tanh_result_type,
-    ti._tanh,
+    ti_ext._tanh_result_type,
+    ti_ext._tanh,
     _TANH_DOCSTRING,
     mkl_fn_to_call="_mkl_tanh_to_call",
     mkl_impl_fn="_tanh",

From b0647db4c46416c091a1355d8767e87847c32ce4 Mon Sep 17 00:00:00 2001
From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com>
Date: Wed, 18 Mar 2026 05:25:17 -0700
Subject: [PATCH 16/43] add tensor linalg extension (#2799)

This PR migrates the `_tensor_linalg_impl` extension to
`dpctl_ext.tensor` and extends `dpctl_ext.tensor` Python API with
`dpctl.tensor` functions `matmul`, `matrix_transpose`, `tensordot`, and
`vecdot`
---
 dpctl_ext/tensor/CMakeLists.txt               |   16 +-
 dpctl_ext/tensor/__init__.py                  |   10 +
 dpctl_ext/tensor/_linear_algebra_functions.py | 1019 ++++
 .../kernels/linalg_functions/dot_product.hpp  | 1402 ++++++
 .../include/kernels/linalg_functions/gemm.hpp | 4240 +++++++++++++++++
 .../libtensor/source/linalg_functions/dot.cpp |  839 ++++
 .../libtensor/source/linalg_functions/dot.hpp |   45 +
 .../linalg_functions/dot_atomic_support.hpp   |   58 +
 .../source/linalg_functions/dot_dispatch.hpp  |  405 ++
 .../tensor/libtensor/source/tensor_linalg.cpp |   41 +
 dpnp/dpnp_iface_manipulation.py               |    2 +-
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py   |    5 +-
 12 files changed, 8077 insertions(+), 5 deletions(-)
 create mode 100644 dpctl_ext/tensor/_linear_algebra_functions.py
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/linalg_functions/dot.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/linalg_functions/dot.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_linalg.cpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index ef3565f9827e..afc7dca4db33 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -166,6 +166,10 @@ set(_sorting_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp
 )
+set(_linalg_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linalg_functions/dot.cpp
+)
 set(_tensor_accumulation_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp
     ${_accumulator_sources}
@@ -182,6 +186,10 @@ set(_tensor_sorting_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp
     ${_sorting_sources}
 )
+set(_tensor_linalg_impl_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_linalg.cpp
+    ${_linalg_sources}
+)
 
 set(_static_lib_trgt simplify_iteration_space)
 
@@ -228,6 +236,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_s
 target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
 list(APPEND _py_trgts ${python_module_name})
 
+set(python_module_name _tensor_linalg_impl)
+pybind11_add_module(${python_module_name} MODULE ${_tensor_linalg_impl_sources})
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_linalg_impl_sources})
+target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
+list(APPEND _py_trgts ${python_module_name})
+
 set(_clang_prefix "")
 if(WIN32)
     set(_clang_prefix "/clang:")
@@ -245,7 +259,7 @@ list(
     ${_elementwise_sources}
     ${_reduction_sources}
     ${_sorting_sources}
-    # ${_linalg_sources}
+    ${_linalg_sources}
     ${_accumulator_sources}
 )
 
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index 70352687c5d6..a6127f1fc27c 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -107,6 +107,12 @@
     take,
     take_along_axis,
 )
+from ._linear_algebra_functions import (
+    matmul,
+    matrix_transpose,
+    tensordot,
+    vecdot,
+)
 from ._manipulation_functions import (
     broadcast_arrays,
     broadcast_to,
@@ -216,6 +222,8 @@
     "min",
     "moveaxis",
     "permute_dims",
+    "matmul",
+    "matrix_transpose",
     "negative",
     "nonzero",
     "ones",
@@ -251,6 +259,7 @@
     "take_along_axis",
     "tan",
     "tanh",
+    "tensordot",
     "tile",
     "top_k",
     "to_numpy",
@@ -262,6 +271,7 @@
     "unique_inverse",
     "unique_values",
     "unstack",
+    "vecdot",
     "where",
     "zeros",
     "zeros_like",
diff --git a/dpctl_ext/tensor/_linear_algebra_functions.py b/dpctl_ext/tensor/_linear_algebra_functions.py
new file mode 100644
index 000000000000..5f6edecf5e59
--- /dev/null
+++ b/dpctl_ext/tensor/_linear_algebra_functions.py
@@ -0,0 +1,1019 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import operator
+
+import dpctl
+import dpctl.tensor as dpt
+from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._tensor_elementwise_impl as tei
+import dpctl_ext.tensor._tensor_impl as ti
+import dpctl_ext.tensor._tensor_linalg_impl as tli
+
+from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
+from ._manipulation_functions import _broadcast_shape_impl
+from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
+from ._type_utils import (
+    _acceptance_fn_default_binary,
+    _find_buf_dtype2,
+    _to_device_supported_dtype,
+)
+
+
+def matrix_transpose(x):
+    r"""matrix_transpose(x)
+
+    Transposes the innermost two dimensions of `x`, where `x` is a
+    2-dimensional matrix or a stack of 2-dimensional matrices.
+
+    To convert from a 1-dimensional array to a 2-dimensional column
+    vector, use x[:, dpt.newaxis].
+
+    Args:
+       x (usm_ndarray):
+          Input array with shape (..., m, n).
+
+    Returns:
+       usm_ndarray:
+          Array with shape (..., n, m).
+    """
+
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x))
+        )
+    if x.ndim < 2:
+        raise ValueError(
+            "dpctl.tensor.matrix_transpose requires array to have"
+            "at least 2 dimensions"
+        )
+
+    return x.mT
+
+
+def tensordot(x1, x2, axes=2):
+    r"""tensordot(x1, x2, axes=2)
+
+    Returns a tensor contraction of `x1` and `x2` over specific axes.
+
+    Args:
+        x1 (usm_ndarray):
+            first input array, expected to have numeric data type.
+        x2 (usm_ndarray):
+            second input array, expected to have numeric data type.
+            Corresponding contracted axes of `x1` and `x2` must be equal.
+        axes (Union[int, Tuple[Sequence[int], Sequence[int]]):
+            number of axes to contract or explicit sequences of axes for
+            `x1` and `x2`, respectively. If `axes` is an integer equal to `N`,
+            then the contraction is performed over last `N` axes of `x1` and
+            the first `N` axis of `x2` in order. The size of each corresponding
+            axis must match and must be non-negative.
+
+                * if `N` equals `0`, the result is the tensor outer product
+                * if `N` equals `1`, the result is the tensor dot product
+                * if `N` equals `2`, the result is the tensor double
+                  contraction (default).
+
+            If `axes` is a tuple of two sequences `(x1_axes, x2_axes)`, the
+            first sequence applies to `x1` and the second sequence applies
+            to `x2`. Both sequences must have equal length, and each axis
+            `x1_axes[i]` for `x1` must have the same size as the respective
+            axis `x2_axes[i]` for `x2`. Each sequence must consist of unique
+            integers that specify valid axes for each respective array.
+            For example, if `x1` has rank `N`, a valid axis must reside on the
+            half-open interval `[-N, N)`.
+    Returns:
+        usm_ndarray:
+            an array containing the tensor contraction whose shape consists of
+            the non-contracted axes of the first array `x1`, followed by the
+            non-contracted axes of the second array `x2`. The returned array
+            must have a data type determined by Type Promotion Rules.
+    """
+    if not isinstance(x1, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
+    if not isinstance(x2, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
+    q1, x1_usm_type = x1.sycl_queue, x1.usm_type
+    q2, x2_usm_type = x2.sycl_queue, x2.usm_type
+    exec_q = dpctl.utils.get_execution_queue((q1, q2))
+    if exec_q is None:
+        raise ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments."
+        )
+    res_usm_type = dpctl.utils.get_coerced_usm_type(
+        (
+            x1_usm_type,
+            x2_usm_type,
+        )
+    )
+    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+    # handle axes and shapes validation
+    x1_nd = x1.ndim
+    x2_nd = x2.ndim
+    x1_shape = x1.shape
+    x2_shape = x2.shape
+    if isinstance(axes, int):
+        if axes < 0:
+            raise ValueError("`axes` integer is expected to be non-negative")
+        n_axes1 = axes
+        n_axes2 = axes
+        axes1 = normalize_axis_tuple(tuple(range(-axes, 0)), x1_nd)
+        axes2 = tuple(range(0, axes))
+    elif isinstance(axes, tuple):
+        if len(axes) != 2:
+            raise ValueError(
+                "`axes` tuple is expected to contain two sequences"
+            )
+        axes1 = tuple(axes[0])
+        axes2 = tuple(axes[1])
+        n_axes1 = len(axes1)
+        n_axes2 = len(axes2)
+    else:
+        raise TypeError("`axes` must be an integer or a tuple of sequences")
+    if n_axes1 != n_axes2:
+        raise ValueError(
+            "number of axes contracted must be the same for each array"
+        )
+    if n_axes1 == 0:
+        arr1 = x1[..., dpt.newaxis]
+        arr2 = x2[dpt.newaxis, ...]
+        n_axes1 = 1
+        n_axes2 = 1
+    else:
+        same_shapes = True
+        for i in range(n_axes1):
+            axis1 = axes1[i]
+            axis2 = axes2[i]
+            same_shapes = same_shapes and (x1_shape[axis1] == x2_shape[axis2])
+        if not same_shapes:
+            raise ValueError("shape mismatch in contracted `tensordot` axes")
+        axes1 = normalize_axis_tuple(axes1, x1_nd)
+        axes2 = normalize_axis_tuple(axes2, x2_nd)
+        perm1 = [i for i in range(x1_nd) if i not in axes1] + list(axes1)
+        perm2 = list(axes2) + [i for i in range(x2_nd) if i not in axes2]
+        arr1 = dpt_ext.permute_dims(x1, perm1)
+        arr2 = dpt_ext.permute_dims(x2, perm2)
+    arr1_outer_nd = arr1.ndim - n_axes1
+    arr2_outer_nd = arr2.ndim - n_axes2
+    res_shape = arr1.shape[:arr1_outer_nd] + arr2.shape[n_axes2:]
+    # dtype validation
+    sycl_dev = exec_q.sycl_device
+    x1_dtype = x1.dtype
+    x2_dtype = x2.dtype
+    buf1_dt, buf2_dt, res_dt = _find_buf_dtype2(
+        x1_dtype,
+        x2_dtype,
+        tli._dot_result_type,
+        sycl_dev,
+        acceptance_fn=_acceptance_fn_default_binary,
+    )
+    if res_dt is None:
+        raise TypeError(
+            "function 'tensordot' does not support input types "
+            f"({x1_dtype}, {x2_dtype}), "
+            "and the inputs could not be safely coerced to any "
+            "supported types according to the casting rule ''safe''."
+        )
+
+    _manager = SequentialOrderManager[exec_q]
+    if buf1_dt is None and buf2_dt is None:
+        out = dpt_ext.empty(
+            res_shape,
+            dtype=res_dt,
+            usm_type=res_usm_type,
+            sycl_queue=exec_q,
+            order="C",
+        )
+        dep_evs = _manager.submitted_events
+        ht_dot_ev, dot_ev = tli._dot(
+            x1=arr1,
+            x2=arr2,
+            batch_dims=0,
+            x1_outer_dims=arr1_outer_nd,
+            x2_outer_dims=arr2_outer_nd,
+            inner_dims=n_axes1,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_dot_ev, dot_ev)
+
+        return out
+
+    elif buf1_dt is None:
+        buf2 = _empty_like_orderK(arr2, buf2_dt)
+
+        dep_evs = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arr2, dst=buf2, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        out = dpt_ext.empty(
+            res_shape,
+            dtype=res_dt,
+            usm_type=res_usm_type,
+            sycl_queue=exec_q,
+            order="C",
+        )
+        ht_dot_ev, dot_ev = tli._dot(
+            x1=arr1,
+            x2=buf2,
+            batch_dims=0,
+            x1_outer_dims=arr1_outer_nd,
+            x2_outer_dims=arr2_outer_nd,
+            inner_dims=n_axes1,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_dot_ev, dot_ev)
+
+        return out
+
+    elif buf2_dt is None:
+        buf1 = _empty_like_orderK(arr1, buf1_dt)
+        dep_evs = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arr1, dst=buf1, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        out = dpt_ext.empty(
+            res_shape,
+            dtype=res_dt,
+            usm_type=res_usm_type,
+            sycl_queue=exec_q,
+            order="C",
+        )
+        ht_dot_ev, dot_ev = tli._dot(
+            x1=buf1,
+            x2=arr2,
+            batch_dims=0,
+            x1_outer_dims=arr1_outer_nd,
+            x2_outer_dims=arr2_outer_nd,
+            inner_dims=n_axes1,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_dot_ev, dot_ev)
+
+        return out
+
+    buf1 = _empty_like_orderK(arr1, buf1_dt)
+    deps_ev = _manager.submitted_events
+    ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=arr1, dst=buf1, sycl_queue=exec_q, depends=deps_ev
+    )
+    _manager.add_event_pair(ht_copy1_ev, copy1_ev)
+    buf2 = _empty_like_orderK(arr2, buf2_dt)
+    ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=arr2, dst=buf2, sycl_queue=exec_q, depends=deps_ev
+    )
+    _manager.add_event_pair(ht_copy2_ev, copy2_ev)
+    out = dpt_ext.empty(
+        res_shape,
+        dtype=res_dt,
+        usm_type=res_usm_type,
+        sycl_queue=exec_q,
+        order="C",
+    )
+    ht_, dot_ev = tli._dot(
+        x1=buf1,
+        x2=buf2,
+        batch_dims=0,
+        x1_outer_dims=arr1_outer_nd,
+        x2_outer_dims=arr2_outer_nd,
+        inner_dims=n_axes1,
+        dst=out,
+        sycl_queue=exec_q,
+        depends=[copy1_ev, copy2_ev],
+    )
+    _manager.add_event_pair(ht_, dot_ev)
+
+    return out
+
+
+def vecdot(x1, x2, axis=-1):
+    r"""vecdot(x1, x2, axis=-1)
+
+    Computes the (vector) dot product of two arrays.
+
+    Args:
+        x1 (usm_ndarray):
+            first input array.
+        x2 (usm_ndarray):
+            second input array. Input arrays must have compatible
+            shapes along non-contract axes according to broadcasting
+            rules, and must have the same size along the contracted
+            axis. Input arrays should be of numeric type.
+        axis (Optional[int]):
+            axis over which to compute the dot product. The axis must
+            be an integer on the interval `[-N, -1]`, where `N` is
+            ``min(x1.ndim, x2.ndim)``. The axis along which dot product
+            is performed is counted backward from the last axes
+            (that is, `-1` refers to the last axis). By default,
+            dot product is computed over the last axis.
+            Default: `-1`.
+
+    Returns:
+        usm_ndarray:
+            if `x1` and `x2` are both one-dimensional arrays, a
+            zero-dimensional array containing the dot product value
+            is returned; otherwise, a non-zero-dimensional array containing
+            the dot products and having rank `N-1`, where `N` is the rank
+            of the shape of input arrays after broadcasting rules are applied
+            to non-contracted axes.
+    """
+    if not isinstance(x1, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
+    if not isinstance(x2, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
+    q1, x1_usm_type = x1.sycl_queue, x1.usm_type
+    q2, x2_usm_type = x2.sycl_queue, x2.usm_type
+    exec_q = dpctl.utils.get_execution_queue((q1, q2))
+    if exec_q is None:
+        raise ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments."
+        )
+    res_usm_type = dpctl.utils.get_coerced_usm_type(
+        (
+            x1_usm_type,
+            x2_usm_type,
+        )
+    )
+    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+    # axis and shape validation
+    x1_nd = x1.ndim
+    x2_nd = x2.ndim
+    x1_shape = x1.shape
+    x2_shape = x2.shape
+    if axis >= 0:
+        raise ValueError("`axis` must be negative")
+    axis = operator.index(axis)
+    x1_axis = normalize_axis_index(axis, x1_nd)
+    x2_axis = normalize_axis_index(axis, x2_nd)
+    if x1_shape[x1_axis] != x2_shape[x2_axis]:
+        raise ValueError(
+            "given axis must have the same shape for `x1` and `x2`"
+        )
+    if x1_nd > x2_nd:
+        x2_shape = (1,) * (x1_nd - x2_nd) + x2_shape
+    elif x2_nd > x1_nd:
+        x1_shape = (1,) * (x2_nd - x1_nd) + x1_shape
+    try:
+        broadcast_sh = _broadcast_shape_impl(
+            [
+                x1_shape,
+                x2_shape,
+            ]
+        )
+    except ValueError:
+        raise ValueError("mismatch in `vecdot` dimensions")
+    broadcast_nd = len(broadcast_sh)
+    contracted_axis = normalize_axis_index(axis, broadcast_nd)
+    res_sh = tuple(
+        [broadcast_sh[i] for i in range(broadcast_nd) if i != contracted_axis]
+    )
+    # dtype validation
+    sycl_dev = exec_q.sycl_device
+    x1_dtype = x1.dtype
+    x2_dtype = x2.dtype
+    buf1_dt, buf2_dt, res_dt = _find_buf_dtype2(
+        x1_dtype,
+        x2_dtype,
+        tli._dot_result_type,
+        sycl_dev,
+        acceptance_fn=_acceptance_fn_default_binary,
+    )
+    if res_dt is None:
+        raise TypeError(
+            "function 'vecdot' does not support input types "
+            f"({x1_dtype}, {x2_dtype}), "
+            "and the inputs could not be safely coerced to any "
+            "supported types according to the casting rule ''safe''."
+        )
+
+    _manager = SequentialOrderManager[exec_q]
+    if buf1_dt is None and buf2_dt is None:
+        if x1.dtype.kind == "c":
+            x1_tmp = _empty_like_orderK(x1, x1.dtype)
+            dep_evs = _manager.submitted_events
+            ht_conj_ev, conj_ev = tei._conj(
+                src=x1, dst=x1_tmp, sycl_queue=exec_q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_conj_ev, conj_ev)
+            x1 = x1_tmp
+        if x1.shape != broadcast_sh:
+            x1 = dpt_ext.broadcast_to(x1, broadcast_sh)
+        if x2.shape != broadcast_sh:
+            x2 = dpt_ext.broadcast_to(x2, broadcast_sh)
+        x1 = dpt_ext.moveaxis(x1, contracted_axis, -1)
+        x2 = dpt_ext.moveaxis(x2, contracted_axis, -1)
+        out = dpt_ext.empty(
+            res_sh,
+            dtype=res_dt,
+            usm_type=res_usm_type,
+            sycl_queue=exec_q,
+            order="C",
+        )
+        dep_evs = _manager.submitted_events
+        ht_dot_ev, dot_ev = tli._dot(
+            x1=x1,
+            x2=x2,
+            batch_dims=len(res_sh),
+            x1_outer_dims=0,
+            x2_outer_dims=0,
+            inner_dims=1,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_dot_ev, dot_ev)
+        return dpt_ext.reshape(out, res_sh)
+
+    elif buf1_dt is None:
+        if x1.dtype.kind == "c":
+            x1_tmp = _empty_like_orderK(x1, x1.dtype)
+            deps_ev = _manager.submitted_events
+            ht_conj_ev, conj_e = tei._conj(
+                src=x1, dst=x1_tmp, sycl_queue=exec_q, depends=deps_ev
+            )
+            _manager.add_event_pair(ht_conj_ev, conj_e)
+            x1 = x1_tmp
+        buf2 = _empty_like_orderK(x2, buf2_dt)
+        deps_ev = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        if x1.shape != broadcast_sh:
+            x1 = dpt_ext.broadcast_to(x1, broadcast_sh)
+        if buf2.shape != broadcast_sh:
+            buf2 = dpt_ext.broadcast_to(buf2, broadcast_sh)
+        x1 = dpt_ext.moveaxis(x1, contracted_axis, -1)
+        buf2 = dpt_ext.moveaxis(buf2, contracted_axis, -1)
+        out = dpt_ext.empty(
+            res_sh,
+            dtype=res_dt,
+            usm_type=res_usm_type,
+            sycl_queue=exec_q,
+            order="C",
+        )
+        ht_dot_ev, dot_ev = tli._dot(
+            x1=x1,
+            x2=buf2,
+            batch_dims=len(res_sh),
+            x1_outer_dims=0,
+            x2_outer_dims=0,
+            inner_dims=1,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_dot_ev, dot_ev)
+        return dpt_ext.reshape(out, res_sh)
+
+    elif buf2_dt is None:
+        buf1 = _empty_like_orderK(x1, buf1_dt)
+        deps_ev = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        if buf1.dtype.kind == "c":
+            ht_conj_ev, conj_ev = tei._conj(
+                src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy_ev]
+            )
+            _manager.add_event_pair(ht_conj_ev, conj_ev)
+        if buf1.shape != broadcast_sh:
+            buf1 = dpt_ext.broadcast_to(buf1, broadcast_sh)
+        if x2.shape != broadcast_sh:
+            x2 = dpt_ext.broadcast_to(x2, broadcast_sh)
+        buf1 = dpt_ext.moveaxis(buf1, contracted_axis, -1)
+        x2 = dpt_ext.moveaxis(x2, contracted_axis, -1)
+        out = dpt_ext.empty(
+            res_sh,
+            dtype=res_dt,
+            usm_type=res_usm_type,
+            sycl_queue=exec_q,
+            order="C",
+        )
+        deps_ev = _manager.submitted_events
+        ht_dot_ev, dot_ev = tli._dot(
+            x1=buf1,
+            x2=x2,
+            batch_dims=len(res_sh),
+            x1_outer_dims=0,
+            x2_outer_dims=0,
+            inner_dims=1,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=deps_ev,
+        )
+        _manager.add_event_pair(ht_dot_ev, dot_ev)
+        return dpt_ext.reshape(out, res_sh)
+
+    buf1 = _empty_like_orderK(x1, buf1_dt)
+    deps_ev = _manager.submitted_events
+    ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev
+    )
+    _manager.add_event_pair(ht_copy1_ev, copy1_ev)
+    if buf1.dtype.kind == "c":
+        ht_conj_ev, conj_ev = tei._conj(
+            src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy1_ev]
+        )
+        _manager.add_event_pair(ht_conj_ev, conj_ev)
+    buf2 = _empty_like_orderK(x2, buf2_dt)
+    ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev
+    )
+    _manager.add_event_pair(ht_copy2_ev, copy2_ev)
+    if buf1.shape != broadcast_sh:
+        buf1 = dpt_ext.broadcast_to(buf1, broadcast_sh)
+    if buf2.shape != broadcast_sh:
+        buf2 = dpt_ext.broadcast_to(buf2, broadcast_sh)
+    buf1 = dpt_ext.moveaxis(buf1, contracted_axis, -1)
+    buf2 = dpt_ext.moveaxis(buf2, contracted_axis, -1)
+    out = dpt_ext.empty(
+        res_sh,
+        dtype=res_dt,
+        usm_type=res_usm_type,
+        sycl_queue=exec_q,
+        order="C",
+    )
+    deps_ev = _manager.submitted_events
+    ht_dot_ev, dot_ev = tli._dot(
+        x1=buf1,
+        x2=buf2,
+        batch_dims=len(res_sh),
+        x1_outer_dims=0,
+        x2_outer_dims=0,
+        inner_dims=1,
+        dst=out,
+        sycl_queue=exec_q,
+        depends=deps_ev,
+    )
+    _manager.add_event_pair(ht_dot_ev, dot_ev)
+    return out
+
+
+def matmul(x1, x2, out=None, dtype=None, order="K"):
+    r"""matmul(x1, x2, out=None, order="K")
+
+    Computes the matrix product. Implements the same semantics
+    as the built-in operator `@`.
+
+    Args:
+        x1 (usm_ndarray):
+            first input array. Expected to have numeric data type, and
+            at least one dimension. If `x1` is one-dimensional having
+            shape `(M,)`, and `x2` has more than one dimension, `x1` is
+            effectively treated as a two-dimensional array with shape `(1, M)`,
+            although the prepended dimension is removed from the output array.
+            If `x1` has shape `(..., M, K)`, the innermost two dimensions form
+            matrices on which to perform matrix multiplication.
+        x2 (usm_ndarray):
+            second input array. Expected to have numeric data type, and
+            at least one dimension. If `x2` is one-dimensional having
+            shape `(N,)`, and `x1` has more than one dimension, `x2` is
+            effectively treated as a two-dimensional array with shape `(N, 1)`,
+            although the appended dimension is removed from the output array.
+            If `x2` has shape `(..., K, N)`, the innermost two dimensions form
+            matrices on which to perform matrix multiplication.
+        out (Optional[usm_ndarray]):
+            the array into which the result of the matrix product is written.
+            The data type of `out` must match the expected data type of the
+            result or (if provided) `dtype`.
+            If `None` then a new array is returned. Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the data type of the
+            returned array is determined by the Type Promotion Rules.
+            Default: `None`.
+        order (["K", "C", "F", "A"]):
+            memory layout of the output array, if `out` is `None`, otherwise
+            the `order` parameter value is not used. Default: `K`.
+    Returns:
+        usm_ndarray:
+            * if both `x1` and `x2` are one-dimensional arrays with shape
+              `(N,)`, returned array is a zero-dimensional array containing
+              inner product as its only element.
+            * if `x1` is two-dimensional array with shape `(M, K)` and `x2` is
+              a two-dimensional array with shape `(K, N)`, returned array is a
+              two-dimensional array with shape `(M, N)` and contains the
+              conventional matrix product.
+            * if `x1` is a one-dimensional array with shape `(K,)` and `x2` is
+              an array with shape `(..., K, N)`, returned array contains the
+              conventional matrix product and has shape `(..., N)`.
+            * if `x1` is an array with shape `(..., M, K)` and `x2` is a
+              one-dimensional array with shape `(K,)`, returned array has shape
+              `(..., M)` and contains the conventional matrix product.
+            * if `x1` is a two-dimensional array with shape `(M, K)` and `x2`
+              is an array with shape `(..., K, N)`, returned array contains
+              conventional matrix product for each stacked matrix and has shape
+              `(..., M, N)`.
+            * if `x1` has shape `(..., M, K)` and `x2` is a two-dimensional
+              array with shape `(K, N)`, returned array contains conventional
+              matrix product for each stacked matrix and has shape
+              `(..., M, N)`.
+            * if both `x1` and `x2` have more than two dimensions, returned
+              array contains conventional matrix product for each stacked
+              matrix and has shape determined by broadcasting rules for
+              `x1.shape[:-2]` and `x2.shape[:-2]`.
+
+            The data type of the returned array is determined by the Type
+            Promotion Rules. If either `x1` or `x2` has a complex floating
+            point type, neither argument is complex conjugated or transposed.
+    """
+    if not isinstance(x1, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
+    if not isinstance(x2, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
+    if order not in ["K", "C", "F", "A"]:
+        order = "K"
+    q1, x1_usm_type = x1.sycl_queue, x1.usm_type
+    q2, x2_usm_type = x2.sycl_queue, x2.usm_type
+    exec_q = dpctl.utils.get_execution_queue((q1, q2))
+    if exec_q is None:
+        raise ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments."
+        )
+    res_usm_type = dpctl.utils.get_coerced_usm_type(
+        (
+            x1_usm_type,
+            x2_usm_type,
+        )
+    )
+    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+
+    x1_nd = x1.ndim
+    x2_nd = x2.ndim
+    if x1_nd == 0 or x2_nd == 0:
+        raise ValueError("one or more operands to `matmul` is 0 dimensional")
+    x1_shape = x1.shape
+    x2_shape = x2.shape
+    appended_axes = []
+    if x1_nd == 1:
+        x1 = x1[dpt.newaxis, :]
+        x1_shape = x1.shape
+        appended_axes.append(-2)
+    if x2_nd == 1:
+        x2 = x2[:, dpt.newaxis]
+        x2_shape = x2.shape
+        appended_axes.append(-1)
+    if x1_shape[-1] != x2_shape[-2]:
+        raise ValueError("mismatch in `matmul` inner dimension")
+    x1_outer_sh = x1_shape[:-2]
+    x2_outer_sh = x2_shape[:-2]
+    try:
+        res_outer_sh = _broadcast_shape_impl(
+            [
+                x1_outer_sh,
+                x2_outer_sh,
+            ]
+        )
+    except ValueError:
+        raise ValueError("mismatch in `matmul` batching dimensions")
+    x1_broadcast_shape = res_outer_sh + x1_shape[-2:]
+    x2_broadcast_shape = res_outer_sh + x2_shape[-2:]
+    res_shape = res_outer_sh + x1_shape[-2:-1] + x2_shape[-1:]
+
+    sycl_dev = exec_q.sycl_device
+    x1_dtype = x1.dtype
+    x2_dtype = x2.dtype
+    if dtype is None:
+        buf1_dt, buf2_dt, res_dt = _find_buf_dtype2(
+            x1_dtype,
+            x2_dtype,
+            tli._dot_result_type,
+            sycl_dev,
+            acceptance_fn=_acceptance_fn_default_binary,
+        )
+        if res_dt is None:
+            raise ValueError(
+                "function 'matmul' does not support input types "
+                f"({x1_dtype}, {x2_dtype}), "
+                "and the inputs could not be safely coerced to any "
+                "supported types according to the casting rule ''safe''."
+            )
+    else:
+        res_dt = dpt.dtype(dtype)
+        res_dt = _to_device_supported_dtype(res_dt, sycl_dev)
+        buf1_dt, buf2_dt = None, None
+        if x1_dtype != res_dt:
+            if dpt_ext.can_cast(x1_dtype, res_dt, casting="same_kind"):
+                buf1_dt = res_dt
+            else:
+                raise ValueError(
+                    r"`matmul` input `x1` cannot be cast from "
+                    f"{x1_dtype} to "
+                    f"requested type {res_dt} according to the casting rule "
+                    "''same_kind''."
+                )
+        if x2_dtype != res_dt:
+            if dpt_ext.can_cast(x2_dtype, res_dt, casting="same_kind"):
+                buf2_dt = res_dt
+            else:
+                raise ValueError(
+                    r"`matmul` input `x2` cannot be cast from "
+                    f"{x2_dtype} to "
+                    f"requested type {res_dt} according to the casting rule "
+                    "''same_kind''."
+                )
+
+    orig_out = out
+    if out is not None:
+        if not isinstance(out, dpt.usm_ndarray):
+            raise TypeError(
+                f"output array must be of usm_ndarray type, got {type(out)}"
+            )
+
+        if not out.flags.writable:
+            raise ValueError("provided `out` array is read-only")
+
+        final_res_shape = tuple(
+            res_shape[i]
+            for i in range(-len(res_shape), 0)
+            if i not in appended_axes
+        )
+        if out.shape != final_res_shape:
+            raise ValueError(
+                "The shape of input and output arrays are inconsistent. "
+                f"Expected output shape is {final_res_shape}, got {out.shape}"
+            )
+
+        if appended_axes:
+            out = dpt_ext.expand_dims(out, axis=appended_axes)
+            orig_out = out
+
+        if res_dt != out.dtype:
+            raise ValueError(
+                f"Output array of type {res_dt} is needed, got {out.dtype}"
+            )
+
+        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise ExecutionPlacementError(
+                "Input and output allocation queues are not compatible"
+            )
+
+        if ti._array_overlap(x1, out) and buf1_dt is None:
+            out = dpt_ext.empty_like(out)
+
+        if ti._array_overlap(x2, out) and buf2_dt is None:
+            # should not reach if out is reallocated
+            # after being checked against x1
+            out = dpt_ext.empty_like(out)
+
+    if order == "A":
+        order = (
+            "F"
+            if all(
+                arr.flags.f_contiguous
+                for arr in (
+                    x1,
+                    x2,
+                )
+            )
+            else "C"
+        )
+
+    _manager = SequentialOrderManager[exec_q]
+    if buf1_dt is None and buf2_dt is None:
+        if out is None:
+            if order == "K":
+                out = _empty_like_pair_orderK(
+                    x1, x2, res_dt, res_shape, res_usm_type, exec_q
+                )
+            else:
+                out = dpt_ext.empty(
+                    res_shape,
+                    dtype=res_dt,
+                    usm_type=res_usm_type,
+                    sycl_queue=exec_q,
+                    order=order,
+                )
+        if x1.shape != x1_broadcast_shape:
+            x1 = dpt_ext.broadcast_to(x1, x1_broadcast_shape)
+        if x2.shape != x2_broadcast_shape:
+            x2 = dpt_ext.broadcast_to(x2, x2_broadcast_shape)
+        deps_evs = _manager.submitted_events
+        ht_dot_ev, dot_ev = tli._dot(
+            x1=x1,
+            x2=x2,
+            batch_dims=len(res_shape[:-2]),
+            x1_outer_dims=1,
+            x2_outer_dims=1,
+            inner_dims=1,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=deps_evs,
+        )
+        _manager.add_event_pair(ht_dot_ev, dot_ev)
+        if not (orig_out is None or orig_out is out):
+            # Copy the out data from temporary buffer to original memory
+            ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out,
+                dst=orig_out,
+                sycl_queue=exec_q,
+                depends=[dot_ev],
+            )
+            _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+            out = orig_out
+        if appended_axes:
+            out = dpt_ext.squeeze(out, tuple(appended_axes))
+        return out
+    elif buf1_dt is None:
+        if order == "K":
+            buf2 = _empty_like_orderK(x2, buf2_dt)
+        else:
+            buf2 = dpt_ext.empty_like(x2, dtype=buf2_dt, order=order)
+        deps_evs = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_evs
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        if out is None:
+            if order == "K":
+                out = _empty_like_pair_orderK(
+                    x1, buf2, res_dt, res_shape, res_usm_type, exec_q
+                )
+            else:
+                out = dpt_ext.empty(
+                    res_shape,
+                    dtype=res_dt,
+                    usm_type=res_usm_type,
+                    sycl_queue=exec_q,
+                    order=order,
+                )
+
+        if x1.shape != x1_broadcast_shape:
+            x1 = dpt_ext.broadcast_to(x1, x1_broadcast_shape)
+        if buf2.shape != x2_broadcast_shape:
+            buf2 = dpt_ext.broadcast_to(buf2, x2_broadcast_shape)
+        ht_dot_ev, dot_ev = tli._dot(
+            x1=x1,
+            x2=buf2,
+            batch_dims=len(res_shape[:-2]),
+            x1_outer_dims=1,
+            x2_outer_dims=1,
+            inner_dims=1,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_dot_ev, dot_ev)
+        if not (orig_out is None or orig_out is out):
+            # Copy the out data from temporary buffer to original memory
+            ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out,
+                dst=orig_out,
+                sycl_queue=exec_q,
+                depends=[dot_ev],
+            )
+            _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+            out = orig_out
+        if appended_axes:
+            out = dpt_ext.squeeze(out, tuple(appended_axes))
+        return out
+
+    elif buf2_dt is None:
+        if order == "K":
+            buf1 = _empty_like_orderK(x1, buf1_dt)
+        else:
+            buf1 = dpt_ext.empty_like(x1, dtype=buf1_dt, order=order)
+        deps_ev = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        if out is None:
+            if order == "K":
+                out = _empty_like_pair_orderK(
+                    buf1, x2, res_dt, res_shape, res_usm_type, exec_q
+                )
+            else:
+                out = dpt_ext.empty(
+                    res_shape,
+                    dtype=res_dt,
+                    usm_type=res_usm_type,
+                    sycl_queue=exec_q,
+                    order=order,
+                )
+
+        if buf1.shape != x1_broadcast_shape:
+            buf1 = dpt_ext.broadcast_to(buf1, x1_broadcast_shape)
+        if x2.shape != x2_broadcast_shape:
+            x2 = dpt_ext.broadcast_to(x2, x2_broadcast_shape)
+        ht_dot_ev, dot_ev = tli._dot(
+            x1=buf1,
+            x2=x2,
+            batch_dims=len(res_shape[:-2]),
+            x1_outer_dims=1,
+            x2_outer_dims=1,
+            inner_dims=1,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_dot_ev, dot_ev)
+        if not (orig_out is None or orig_out is out):
+            # Copy the out data from temporary buffer to original memory
+            ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out,
+                dst=orig_out,
+                sycl_queue=exec_q,
+                depends=[dot_ev],
+            )
+            _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+            out = orig_out
+        if appended_axes:
+            out = dpt_ext.squeeze(out, tuple(appended_axes))
+        return out
+
+    if order == "K":
+        if x1.flags.c_contiguous and x2.flags.c_contiguous:
+            order = "C"
+        elif x1.flags.f_contiguous and x2.flags.f_contiguous:
+            order = "F"
+    if order == "K":
+        buf1 = _empty_like_orderK(x1, buf1_dt)
+    else:
+        buf1 = dpt_ext.empty_like(x1, dtype=buf1_dt, order=order)
+    deps_ev = _manager.submitted_events
+    ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev
+    )
+    _manager.add_event_pair(ht_copy1_ev, copy1_ev)
+    if order == "K":
+        buf2 = _empty_like_orderK(x2, buf2_dt)
+    else:
+        buf2 = dpt_ext.empty_like(x2, dtype=buf2_dt, order=order)
+    ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev
+    )
+    _manager.add_event_pair(ht_copy2_ev, copy2_ev)
+    if out is None:
+        if order == "K":
+            out = _empty_like_pair_orderK(
+                buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
+            )
+        else:
+            out = dpt_ext.empty(
+                res_shape,
+                dtype=res_dt,
+                usm_type=res_usm_type,
+                sycl_queue=exec_q,
+                order=order,
+            )
+
+    if buf1.shape != x1_broadcast_shape:
+        buf1 = dpt_ext.broadcast_to(buf1, x1_broadcast_shape)
+    if buf2.shape != x2_broadcast_shape:
+        buf2 = dpt_ext.broadcast_to(buf2, x2_broadcast_shape)
+    ht_, dot_ev = tli._dot(
+        x1=buf1,
+        x2=buf2,
+        batch_dims=len(res_shape[:-2]),
+        x1_outer_dims=1,
+        x2_outer_dims=1,
+        inner_dims=1,
+        dst=out,
+        sycl_queue=exec_q,
+        depends=[copy1_ev, copy2_ev],
+    )
+    _manager.add_event_pair(ht_, dot_ev)
+    if appended_axes:
+        out = dpt_ext.squeeze(out, tuple(appended_axes))
+    return out
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
new file mode 100644
index 000000000000..697fad932755
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
@@ -0,0 +1,1402 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for the vector dot product.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/reductions.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels
+{
+
+using dpctl::tensor::ssize_t;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+template <typename lhsT,
+          typename rhsT,
+          typename outT,
+          typename BatchIndexerT,
+          typename RedIndexerT>
+struct SequentialDotProduct
+{
+private:
+    const lhsT *lhs_ = nullptr;
+    const rhsT *rhs_ = nullptr;
+    outT *out_ = nullptr;
+    BatchIndexerT batch_indexer_;
+    RedIndexerT reduced_dims_indexer_;
+    std::size_t reduction_max_gid_ = 0;
+
+public:
+    SequentialDotProduct(const lhsT *lhs,
+                         const rhsT *rhs,
+                         outT *out,
+                         BatchIndexerT batch_indexer,
+                         RedIndexerT reduced_dims_indexer,
+                         std::size_t reduction_size)
+        : lhs_(lhs), rhs_(rhs), out_(out), batch_indexer_(batch_indexer),
+          reduced_dims_indexer_(reduced_dims_indexer),
+          reduction_max_gid_(reduction_size)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+
+        auto const &batch_offsets = batch_indexer_(id[0]);
+        const ssize_t &lhs_batch_offset = batch_offsets.get_first_offset();
+        const ssize_t &rhs_batch_offset = batch_offsets.get_second_offset();
+        const ssize_t &out_batch_offset = batch_offsets.get_third_offset();
+
+        outT red_val(0);
+        for (std::size_t m = 0; m < reduction_max_gid_; ++m) {
+            auto reduction_offsets = reduced_dims_indexer_(m);
+            auto lhs_reduction_offset = reduction_offsets.get_first_offset();
+            auto rhs_reduction_offset = reduction_offsets.get_second_offset();
+
+            using dpctl::tensor::type_utils::convert_impl;
+            red_val += convert_impl<outT, lhsT>(
+                           lhs_[lhs_batch_offset + lhs_reduction_offset]) *
+                       convert_impl<outT, rhsT>(
+                           rhs_[rhs_batch_offset + rhs_reduction_offset]);
+        }
+
+        out_[out_batch_offset] = red_val;
+    }
+};
+
+template <typename lhsT,
+          typename rhsT,
+          typename outT,
+          typename ReductionOpT,
+          typename BatchIndexerT,
+          typename RedIndexerT>
+struct DotProductFunctor
+{
+private:
+    const lhsT *lhs_ = nullptr;
+    const rhsT *rhs_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOpT reduction_op_;
+    BatchIndexerT batch_indexer_;
+    RedIndexerT reduced_dims_indexer_;
+    std::size_t reduction_max_gid_ = 0;
+    std::size_t batches_ = 1;
+    std::size_t reductions_per_wi = 16;
+
+public:
+    DotProductFunctor(const lhsT *lhs,
+                      const rhsT *rhs,
+                      outT *res,
+                      const ReductionOpT &reduction_op,
+                      const BatchIndexerT &batch_indexer,
+                      const RedIndexerT &arg_reduced_dims_indexer,
+                      std::size_t reduction_size,
+                      std::size_t iteration_size,
+                      std::size_t reduction_size_per_wi)
+        : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op),
+          batch_indexer_(batch_indexer),
+          reduced_dims_indexer_(arg_reduced_dims_indexer),
+          reduction_max_gid_(reduction_size), batches_(iteration_size),
+          reductions_per_wi(reduction_size_per_wi)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t batch_id = it.get_group(0) % batches_;
+        const std::size_t reduction_batch_id = it.get_group(0) / batches_;
+
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); //   0 <= reduction_lid < wg
+
+        // work-items operate over input with indices
+        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //   + reduction_lid
+        // for 0 <= m < reductions_per_wi
+        // for each input
+
+        const auto &batch_offsets_ = batch_indexer_(batch_id);
+        const auto &lhs_batch_offset = batch_offsets_.get_first_offset();
+        const auto &rhs_batch_offset = batch_offsets_.get_second_offset();
+        const auto &out_batch_offset = batch_offsets_.get_third_offset();
+
+        outT local_red_val(0);
+        std::size_t arg_reduce_gid0 =
+            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
+        std::size_t arg_reduce_gid_max = std::min(
+            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
+
+        for (std::size_t arg_reduce_gid = arg_reduce_gid0;
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
+        {
+            auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
+            const auto &lhs_reduction_offset =
+                reduction_offsets_.get_first_offset();
+            const auto &rhs_reduction_offset =
+                reduction_offsets_.get_second_offset();
+
+            using dpctl::tensor::type_utils::convert_impl;
+            outT val = convert_impl<outT, lhsT>(
+                           lhs_[lhs_batch_offset + lhs_reduction_offset]) *
+                       convert_impl<outT, rhsT>(
+                           rhs_[rhs_batch_offset + rhs_reduction_offset]);
+
+            local_red_val += val;
+        }
+
+        auto work_group = it.get_group();
+        outT red_val_over_wg = sycl::reduce_over_group(
+            work_group, local_red_val, outT(0), reduction_op_);
+
+        if (work_group.leader()) {
+            sycl::atomic_ref<outT, sycl::memory_order::relaxed,
+                             sycl::memory_scope::device,
+                             sycl::access::address_space::global_space>
+                res_ref(out_[out_batch_offset]);
+            res_ref += red_val_over_wg;
+        }
+    }
+};
+
+template <typename lhsT,
+          typename rhsT,
+          typename outT,
+          typename ReductionOpT,
+          typename BatchIndexerT,
+          typename RedIndexerT,
+          typename SlmT>
+struct DotProductCustomFunctor
+{
+private:
+    const lhsT *lhs_ = nullptr;
+    const rhsT *rhs_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOpT reduction_op_;
+    BatchIndexerT batch_indexer_;
+    RedIndexerT reduced_dims_indexer_;
+    SlmT local_mem_;
+    std::size_t reduction_max_gid_ = 0;
+    std::size_t batches_ = 1;
+    std::size_t reductions_per_wi = 16;
+
+public:
+    DotProductCustomFunctor(const lhsT *lhs,
+                            const rhsT *rhs,
+                            outT *res,
+                            const ReductionOpT &reduction_op,
+                            const BatchIndexerT &batch_indexer,
+                            const RedIndexerT &arg_reduced_dims_indexer,
+                            SlmT local_mem,
+                            std::size_t reduction_size,
+                            std::size_t iteration_size,
+                            std::size_t reduction_size_per_wi)
+        : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op),
+          batch_indexer_(batch_indexer),
+          reduced_dims_indexer_(arg_reduced_dims_indexer),
+          local_mem_(local_mem), reduction_max_gid_(reduction_size),
+          batches_(iteration_size), reductions_per_wi(reduction_size_per_wi)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t batch_id = it.get_group(0) % batches_;
+        const std::size_t reduction_batch_id = it.get_group(0) / batches_;
+
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); //   0 <= reduction_lid < wg
+
+        // work-items operate over input with indices
+        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //   + reduction_lid
+        // for 0 <= m < reductions_per_wi
+        // for each input
+
+        const auto &batch_offsets_ = batch_indexer_(batch_id);
+        const auto &lhs_batch_offset = batch_offsets_.get_first_offset();
+        const auto &rhs_batch_offset = batch_offsets_.get_second_offset();
+        const auto &out_batch_offset = batch_offsets_.get_third_offset();
+
+        outT local_red_val(0);
+        std::size_t arg_reduce_gid0 =
+            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
+        std::size_t arg_reduce_gid_max = std::min(
+            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
+
+        for (std::size_t arg_reduce_gid = arg_reduce_gid0;
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
+        {
+            auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
+            const auto &lhs_reduction_offset =
+                reduction_offsets_.get_first_offset();
+            const auto &rhs_reduction_offset =
+                reduction_offsets_.get_second_offset();
+
+            using dpctl::tensor::type_utils::convert_impl;
+            outT val = convert_impl<outT, lhsT>(
+                           lhs_[lhs_batch_offset + lhs_reduction_offset]) *
+                       convert_impl<outT, rhsT>(
+                           rhs_[rhs_batch_offset + rhs_reduction_offset]);
+
+            local_red_val += val;
+        }
+
+        auto work_group = it.get_group();
+        outT red_val_over_wg = su_ns::custom_reduce_over_group(
+            work_group, local_mem_, local_red_val, reduction_op_);
+
+        if (work_group.leader()) {
+            sycl::atomic_ref<outT, sycl::memory_order::relaxed,
+                             sycl::memory_scope::device,
+                             sycl::access::address_space::global_space>
+                res_ref(out_[out_batch_offset]);
+            res_ref += red_val_over_wg;
+        }
+    }
+};
+
+template <
+    typename lhsTy,
+    typename rhsTy,
+    typename resTy,
+    typename BatchIndexerT,
+    typename RedIndexerT,
+    template <typename T1, typename T2, typename T3, typename T4, typename T5>
+    class kernel_name_token>
+sycl::event sequential_dot_product(sycl::queue &exec_q,
+                                   const lhsTy *lhs,
+                                   const rhsTy *rhs,
+                                   resTy *res,
+                                   std::size_t batches,
+                                   std::size_t reduction_nelems,
+                                   const BatchIndexerT &batch_indexer,
+                                   const RedIndexerT &reduction_indexer,
+                                   const std::vector<sycl::event> &depends)
+{
+    sycl::event dot_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        cgh.parallel_for<
+            kernel_name_token<lhsTy, rhsTy, resTy, BatchIndexerT, RedIndexerT>>(
+            sycl::range<1>(batches),
+            SequentialDotProduct<lhsTy, rhsTy, resTy, BatchIndexerT,
+                                 RedIndexerT>(lhs, rhs, res, batch_indexer,
+                                              reduction_indexer,
+                                              reduction_nelems));
+    });
+
+    return dot_ev;
+}
+
+template <typename lhsTy,
+          typename rhsTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename BatchIndexerT,
+          typename RedIndexerT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename T4,
+                    typename T5,
+                    typename T6>
+          class kernel_name_token>
+sycl::event submit_atomic_dot_product(sycl::queue &exec_q,
+                                      const lhsTy *lhs,
+                                      const rhsTy *rhs,
+                                      resTy *res,
+                                      std::size_t wg,
+                                      std::size_t batches,
+                                      std::size_t reduction_nelems,
+                                      std::size_t reductions_per_wi,
+                                      std::size_t reduction_groups,
+                                      const BatchIndexerT &batch_indexer,
+                                      const RedIndexerT &reduction_indexer,
+                                      const std::vector<sycl::event> &depends)
+{
+    sycl::event dot_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        auto globalRange = sycl::range<1>{batches * reduction_groups * wg};
+        auto localRange = sycl::range<1>{wg};
+        auto ndRange = sycl::nd_range<1>(globalRange, localRange);
+
+        if constexpr (can_use_reduce_over_group<ReductionOpT, resTy>::value) {
+            using KernelName =
+                class kernel_name_token<lhsTy, rhsTy, resTy, ReductionOpT,
+                                        BatchIndexerT, RedIndexerT>;
+
+            cgh.parallel_for<KernelName>(
+                ndRange, DotProductFunctor<lhsTy, rhsTy, resTy, ReductionOpT,
+                                           BatchIndexerT, RedIndexerT>(
+                             lhs, rhs, res, ReductionOpT(), batch_indexer,
+                             reduction_indexer, reduction_nelems, batches,
+                             reductions_per_wi));
+        }
+        else {
+            using SlmT = sycl::local_accessor<resTy, 1>;
+            SlmT local_memory = SlmT(localRange, cgh);
+
+            using KernelName = class custom_reduction_wrapper<kernel_name_token<
+                lhsTy, rhsTy, resTy, ReductionOpT, BatchIndexerT, RedIndexerT>>;
+
+            cgh.parallel_for<KernelName>(
+                ndRange,
+                DotProductCustomFunctor<lhsTy, rhsTy, resTy, ReductionOpT,
+                                        BatchIndexerT, RedIndexerT, SlmT>(
+                    lhs, rhs, res, ReductionOpT(), batch_indexer,
+                    reduction_indexer, local_memory, reduction_nelems, batches,
+                    reductions_per_wi));
+        }
+    });
+    return dot_ev;
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class dot_product_seq_krn;
+
+template <typename T1, typename T2, typename T3>
+class dot_product_init_krn;
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          typename T6>
+class dot_product_krn;
+
+typedef sycl::event (*dot_product_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    const char *,
+    char *,
+    int,
+    const ssize_t *,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    int,
+    const ssize_t *,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event dot_product_impl(sycl::queue &exec_q,
+                             std::size_t batches,
+                             std::size_t reduction_nelems,
+                             const char *lhs_cp,
+                             const char *rhs_cp,
+                             char *res_cp,
+                             int batch_nd,
+                             const ssize_t *batch_shape_and_strides,
+                             ssize_t batch_lhs_offset,
+                             ssize_t batch_rhs_offset,
+                             ssize_t batch_res_offset,
+                             int red_nd,
+                             const ssize_t *reduction_shape_stride,
+                             ssize_t reduction_lhs_offset,
+                             ssize_t reduction_rhs_offset,
+                             const std::vector<sycl::event> &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputOutputBatchIndexerT =
+            dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
+        using ReductionIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+        const InputOutputBatchIndexerT inp_out_batch_indexer{
+            batch_nd, batch_lhs_offset, batch_rhs_offset, batch_res_offset,
+            batch_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset,
+                                                  reduction_rhs_offset,
+                                                  reduction_shape_stride};
+
+        sycl::event dot_ev =
+            sequential_dot_product<lhsTy, rhsTy, resTy,
+                                   InputOutputBatchIndexerT, ReductionIndexerT,
+                                   dot_product_seq_krn>(
+                exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems,
+                inp_out_batch_indexer, reduction_indexer, depends);
+
+        return dot_ev;
+    }
+    else {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const ssize_t *const &res_shape = batch_shape_and_strides;
+            const ssize_t *const &res_strides =
+                batch_shape_and_strides + 3 * batch_nd;
+            const IndexerT res_indexer(batch_nd, batch_res_offset, res_shape,
+                                       res_strides);
+            using InitKernelName =
+                class dot_product_init_krn<lhsTy, rhsTy, resTy>;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for<InitKernelName>(
+                sycl::range<1>(batches), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = 0;
+                });
+        });
+
+        using ReductionOpT = sycl::plus<resTy>;
+
+        using BatchIndexerT =
+            dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
+        using ReductionIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+        const BatchIndexerT batch_indexer{batch_nd, batch_lhs_offset,
+                                          batch_rhs_offset, batch_res_offset,
+                                          batch_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset,
+                                                  reduction_rhs_offset,
+                                                  reduction_shape_stride};
+
+        static constexpr std::size_t preferred_reductions_per_wi =
+            4; // determined experimentally
+        std::size_t reductions_per_wi =
+            (reduction_nelems < preferred_reductions_per_wi * wg)
+                ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
+                : preferred_reductions_per_wi;
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+
+        sycl::event dot_ev =
+            submit_atomic_dot_product<lhsTy, rhsTy, resTy, ReductionOpT,
+                                      BatchIndexerT, ReductionIndexerT,
+                                      dot_product_krn>(
+                exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems,
+                reductions_per_wi, reduction_groups, batch_indexer,
+                reduction_indexer, {res_init_ev});
+
+        return dot_ev;
+    }
+}
+
+typedef sycl::event (*dot_product_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    const char *,
+    char *,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event
+    dot_product_contig_impl(sycl::queue &exec_q,
+                            std::size_t batches,
+                            std::size_t reduction_nelems,
+                            const char *lhs_cp,
+                            const char *rhs_cp,
+                            char *res_cp,
+                            ssize_t batch_lhs_offset,
+                            ssize_t batch_rhs_offset,
+                            ssize_t batch_res_offset,
+                            ssize_t reduction_lhs_offset,
+                            ssize_t reduction_rhs_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp) +
+                          batch_lhs_offset + reduction_lhs_offset;
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp) +
+                          batch_rhs_offset + reduction_rhs_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + batch_res_offset;
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputBatchIndexerT =
+            dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputBatchIndexerT =
+            dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
+                InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+
+        const InputBatchIndexerT inp_batch_indexer{/* size */ batches,
+                                                   /* step */ reduction_nelems};
+        const InputOutputBatchIndexerT inp_out_batch_indexer{
+            inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{},
+                                                             NoOpIndexerT{}};
+
+        sycl::event dot_ev =
+            sequential_dot_product<lhsTy, rhsTy, resTy,
+                                   InputOutputBatchIndexerT, ReductionIndexerT,
+                                   dot_product_seq_krn>(
+                exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems,
+                inp_out_batch_indexer, reduction_indexer, depends);
+
+        return dot_ev;
+    }
+    else {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            cgh.fill<resTy>(res_tp, resTy(0), batches);
+        });
+
+        using ReductionOpT = sycl::plus<resTy>;
+
+        using InputBatchIndexerT =
+            dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputBatchIndexerT =
+            dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
+                InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+
+        const InputBatchIndexerT inp_batch_indexer{/* size */ batches,
+                                                   /* step */ reduction_nelems};
+        const InputOutputBatchIndexerT inp_out_batch_indexer{
+            inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{},
+                                                             NoOpIndexerT{}};
+
+        static constexpr std::size_t preferred_reductions_per_wi =
+            4; // determined experimentally
+        std::size_t reductions_per_wi =
+            (reduction_nelems < preferred_reductions_per_wi * wg)
+                ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
+                : preferred_reductions_per_wi;
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+
+        sycl::event dot_ev =
+            submit_atomic_dot_product<lhsTy, rhsTy, resTy, ReductionOpT,
+                                      InputOutputBatchIndexerT,
+                                      ReductionIndexerT, dot_product_krn>(
+                exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems,
+                reductions_per_wi, reduction_groups, inp_out_batch_indexer,
+                reduction_indexer, {res_init_ev});
+
+        return dot_ev;
+    }
+}
+
+template <typename lhsT,
+          typename rhsT,
+          typename outT,
+          typename ReductionOpT,
+          typename BatchIndexerT,
+          typename RedIndexerT>
+struct DotProductNoAtomicFunctor
+{
+private:
+    const lhsT *lhs_ = nullptr;
+    const rhsT *rhs_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOpT reduction_op_;
+    BatchIndexerT batch_indexer_;
+    RedIndexerT reduced_dims_indexer_;
+    std::size_t reduction_max_gid_ = 0;
+    std::size_t batches_ = 1;
+    std::size_t reductions_per_wi = 16;
+
+public:
+    DotProductNoAtomicFunctor(const lhsT *lhs,
+                              const rhsT *rhs,
+                              outT *res,
+                              const ReductionOpT &reduction_op,
+                              const BatchIndexerT &batch_indexer,
+                              const RedIndexerT &arg_reduced_dims_indexer,
+                              std::size_t reduction_size,
+                              std::size_t iteration_size,
+                              std::size_t reduction_size_per_wi)
+        : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op),
+          batch_indexer_(batch_indexer),
+          reduced_dims_indexer_(arg_reduced_dims_indexer),
+          reduction_max_gid_(reduction_size), batches_(iteration_size),
+          reductions_per_wi(reduction_size_per_wi)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); //   0 <= reduction_lid < wg
+
+        const std::size_t batch_id = it.get_group(0) % batches_;
+        const std::size_t reduction_batch_id = it.get_group(0) / batches_;
+        const std::size_t n_reduction_groups = it.get_group_range(0) / batches_;
+
+        // work-items operate over input with indices
+        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //   + reduction_lid
+        // for 0 <= m < reductions_per_wi
+        // for each input
+
+        const auto &batch_offsets_ = batch_indexer_(batch_id);
+        const auto &lhs_batch_offset = batch_offsets_.get_first_offset();
+        const auto &rhs_batch_offset = batch_offsets_.get_second_offset();
+        const auto &out_batch_offset = batch_offsets_.get_third_offset();
+
+        outT local_red_val(0);
+        std::size_t arg_reduce_gid0 =
+            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
+        std::size_t arg_reduce_gid_max = std::min(
+            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
+
+        for (std::size_t arg_reduce_gid = arg_reduce_gid0;
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
+        {
+            auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
+            const auto &lhs_reduction_offset =
+                reduction_offsets_.get_first_offset();
+            const auto &rhs_reduction_offset =
+                reduction_offsets_.get_second_offset();
+
+            using dpctl::tensor::type_utils::convert_impl;
+            outT val = convert_impl<outT, lhsT>(
+                           lhs_[lhs_batch_offset + lhs_reduction_offset]) *
+                       convert_impl<outT, rhsT>(
+                           rhs_[rhs_batch_offset + rhs_reduction_offset]);
+
+            local_red_val += val;
+        }
+
+        auto work_group = it.get_group();
+
+        using RedOpT = typename std::conditional<std::is_same_v<outT, bool>,
+                                                 sycl::logical_or<outT>,
+                                                 sycl::plus<outT>>::type;
+        outT red_val_over_wg = sycl::reduce_over_group(
+            work_group, local_red_val, outT(0), RedOpT());
+
+        if (work_group.leader()) {
+            // each group writes to a different memory location
+            out_[out_batch_offset * n_reduction_groups + reduction_batch_id] =
+                red_val_over_wg;
+        }
+    }
+};
+
+template <typename lhsT,
+          typename rhsT,
+          typename outT,
+          typename ReductionOpT,
+          typename BatchIndexerT,
+          typename RedIndexerT,
+          typename SlmT>
+struct DotProductNoAtomicCustomFunctor
+{
+private:
+    const lhsT *lhs_ = nullptr;
+    const rhsT *rhs_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOpT reduction_op_;
+    BatchIndexerT batch_indexer_;
+    RedIndexerT reduced_dims_indexer_;
+    SlmT local_mem_;
+    std::size_t reduction_max_gid_ = 0;
+    std::size_t batches_ = 1;
+    std::size_t reductions_per_wi = 16;
+
+public:
+    DotProductNoAtomicCustomFunctor(const lhsT *lhs,
+                                    const rhsT *rhs,
+                                    outT *res,
+                                    const ReductionOpT &reduction_op,
+                                    const BatchIndexerT &batch_indexer,
+                                    const RedIndexerT &arg_reduced_dims_indexer,
+                                    SlmT local_mem,
+                                    std::size_t reduction_size,
+                                    std::size_t iteration_size,
+                                    std::size_t reduction_size_per_wi)
+        : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op),
+          batch_indexer_(batch_indexer),
+          reduced_dims_indexer_(arg_reduced_dims_indexer),
+          local_mem_(local_mem), reduction_max_gid_(reduction_size),
+          batches_(iteration_size), reductions_per_wi(reduction_size_per_wi)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); //   0 <= reduction_lid < wg
+
+        const std::size_t batch_id = it.get_group(0) % batches_;
+        const std::size_t reduction_batch_id = it.get_group(0) / batches_;
+        const std::size_t n_reduction_groups = it.get_group_range(0) / batches_;
+
+        // work-items operate over input with indices
+        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //   + reduction_lid
+        // for 0 <= m < reductions_per_wi
+        // for each input
+
+        const auto &batch_offsets_ = batch_indexer_(batch_id);
+        const auto &lhs_batch_offset = batch_offsets_.get_first_offset();
+        const auto &rhs_batch_offset = batch_offsets_.get_second_offset();
+        const auto &out_batch_offset = batch_offsets_.get_third_offset();
+
+        outT local_red_val(0);
+        std::size_t arg_reduce_gid0 =
+            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
+        std::size_t arg_reduce_gid_max = std::min(
+            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
+
+        for (std::size_t arg_reduce_gid = arg_reduce_gid0;
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
+        {
+            auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
+            const auto &lhs_reduction_offset =
+                reduction_offsets_.get_first_offset();
+            const auto &rhs_reduction_offset =
+                reduction_offsets_.get_second_offset();
+
+            using dpctl::tensor::type_utils::convert_impl;
+            outT val = convert_impl<outT, lhsT>(
+                           lhs_[lhs_batch_offset + lhs_reduction_offset]) *
+                       convert_impl<outT, rhsT>(
+                           rhs_[rhs_batch_offset + rhs_reduction_offset]);
+
+            local_red_val += val;
+        }
+
+        auto work_group = it.get_group();
+
+        outT red_val_over_wg = su_ns::custom_reduce_over_group(
+            work_group, local_mem_, local_red_val, reduction_op_);
+
+        if (work_group.leader()) {
+            // each group writes to a different memory location
+            out_[out_batch_offset * n_reduction_groups + reduction_batch_id] =
+                red_val_over_wg;
+        }
+    }
+};
+
+template <typename lhsTy,
+          typename rhsTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename BatchIndexerT,
+          typename RedIndexerT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename T4,
+                    typename T5,
+                    typename T6>
+          class kernel_name_token>
+sycl::event
+    submit_no_atomic_dot_product(sycl::queue &exec_q,
+                                 const lhsTy *lhs,
+                                 const rhsTy *rhs,
+                                 resTy *res,
+                                 std::size_t wg,
+                                 std::size_t batches,
+                                 std::size_t reduction_nelems,
+                                 std::size_t reductions_per_wi,
+                                 std::size_t reduction_groups,
+                                 const BatchIndexerT &batch_indexer,
+                                 const RedIndexerT &reduction_indexer,
+                                 const std::vector<sycl::event> &depends)
+{
+    sycl::event dot_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        auto globalRange = sycl::range<1>{batches * reduction_groups * wg};
+        auto localRange = sycl::range<1>{wg};
+        auto ndRange = sycl::nd_range<1>(globalRange, localRange);
+
+        if constexpr (can_use_reduce_over_group<ReductionOpT, resTy>::value) {
+            using KernelName =
+                class kernel_name_token<lhsTy, rhsTy, resTy, ReductionOpT,
+                                        BatchIndexerT, RedIndexerT>;
+
+            cgh.parallel_for<KernelName>(
+                ndRange,
+                DotProductNoAtomicFunctor<lhsTy, rhsTy, resTy, ReductionOpT,
+                                          BatchIndexerT, RedIndexerT>(
+                    lhs, rhs, res, ReductionOpT(), batch_indexer,
+                    reduction_indexer, reduction_nelems, batches,
+                    reductions_per_wi));
+        }
+        else {
+            using SlmT = sycl::local_accessor<resTy, 1>;
+            SlmT local_memory = SlmT(localRange, cgh);
+
+            using KernelName = class custom_reduction_wrapper<kernel_name_token<
+                lhsTy, rhsTy, resTy, ReductionOpT, BatchIndexerT, RedIndexerT>>;
+
+            cgh.parallel_for<KernelName>(
+                ndRange,
+                DotProductNoAtomicCustomFunctor<lhsTy, rhsTy, resTy,
+                                                ReductionOpT, BatchIndexerT,
+                                                RedIndexerT, SlmT>(
+                    lhs, rhs, res, ReductionOpT(), batch_indexer,
+                    reduction_indexer, local_memory, reduction_nelems, batches,
+                    reductions_per_wi));
+        }
+    });
+    return dot_ev;
+}
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          typename T6>
+class dot_product_tree_krn;
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class dot_product_tree_reduction_krn;
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event dot_product_tree_impl(sycl::queue &exec_q,
+                                  std::size_t batches,
+                                  std::size_t reduction_nelems,
+                                  const char *lhs_cp,
+                                  const char *rhs_cp,
+                                  char *res_cp,
+                                  int batch_nd,
+                                  const ssize_t *batch_shape_and_strides,
+                                  ssize_t batch_lhs_offset,
+                                  ssize_t batch_rhs_offset,
+                                  ssize_t batch_res_offset,
+                                  int red_nd,
+                                  const ssize_t *reduction_shape_stride,
+                                  ssize_t reduction_lhs_offset,
+                                  ssize_t reduction_rhs_offset,
+                                  const std::vector<sycl::event> &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputOutputBatchIndexerT =
+            dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
+        using ReductionIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+        const InputOutputBatchIndexerT inp_out_batch_indexer{
+            batch_nd, batch_lhs_offset, batch_rhs_offset, batch_res_offset,
+            batch_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset,
+                                                  reduction_rhs_offset,
+                                                  reduction_shape_stride};
+
+        sycl::event dot_ev =
+            sequential_dot_product<lhsTy, rhsTy, resTy,
+                                   InputOutputBatchIndexerT, ReductionIndexerT,
+                                   dot_product_seq_krn>(
+                exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems,
+                inp_out_batch_indexer, reduction_indexer, depends);
+
+        return dot_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    using ReductionOpT = typename std::conditional<std::is_same_v<resTy, bool>,
+                                                   sycl::logical_or<resTy>,
+                                                   sycl::plus<resTy>>::type;
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        using BatchIndexerT =
+            dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
+        using ReductionIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+        const BatchIndexerT batch_indexer{batch_nd, batch_lhs_offset,
+                                          batch_rhs_offset, batch_res_offset,
+                                          batch_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset,
+                                                  reduction_rhs_offset,
+                                                  reduction_shape_stride};
+
+        if (batches == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event dot_ev =
+            submit_no_atomic_dot_product<lhsTy, rhsTy, resTy, ReductionOpT,
+                                         BatchIndexerT, ReductionIndexerT,
+                                         dot_product_tree_krn>(
+                exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems,
+                reductions_per_wi, reduction_groups, batch_indexer,
+                reduction_indexer, depends);
+
+        return dot_ev;
+    }
+    else {
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+
+        // more than one work-groups is needed, requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        // returns unique_ptr
+        auto partially_reduced_tmp_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                batches * (reduction_groups + second_iter_reduction_groups_),
+                exec_q);
+
+        resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * batches;
+
+        sycl::event first_reduction_ev;
+        {
+            using LhsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+            using RhsIndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputBatchIndexerT =
+                dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
+                    LhsIndexerT, RhsIndexerT, ResIndexerT>;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+            const LhsIndexerT lhs_indexer(batch_nd, batch_lhs_offset,
+                                          batch_shape_and_strides);
+            const RhsIndexerT rhs_indexer(
+                batch_nd, batch_rhs_offset, batch_shape_and_strides,
+                batch_shape_and_strides + 2 * batch_nd);
+            static constexpr ResIndexerT noop_tmp_indexer{};
+
+            const InputOutputBatchIndexerT in_out_iter_indexer{
+                lhs_indexer, rhs_indexer, noop_tmp_indexer};
+            const ReductionIndexerT reduction_indexer{
+                red_nd, reduction_lhs_offset, reduction_rhs_offset,
+                reduction_shape_stride};
+
+            first_reduction_ev = submit_no_atomic_dot_product<
+                lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT,
+                ReductionIndexerT, dot_product_tree_krn>(
+                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, wg, batches,
+                reduction_nelems, preferred_reductions_per_wi, reduction_groups,
+                in_out_iter_indexer, reduction_indexer, depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
+                 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+            const InputIndexerT inp_indexer{/* size */ batches,
+                                            /* step */ reduction_groups_};
+            static constexpr ResIndexerT res_iter_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              res_iter_indexer};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            sycl::event partial_reduction_ev =
+                dpctl::tensor::kernels::submit_no_atomic_reduction<
+                    resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                    ReductionIndexerT, dot_product_tree_reduction_krn>(
+                    exec_q, temp_arg, temp2_arg, identity_val, wg, batches,
+                    remaining_reduction_nelems, preferred_reductions_per_wi,
+                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
+                    {dependent_ev});
+
+            remaining_reduction_nelems = reduction_groups_;
+            std::swap(temp_arg, temp2_arg);
+            dependent_ev = std::move(partial_reduction_ev);
+        }
+
+        // final reduction to res
+        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        const InputIndexerT inp_indexer{/* size */ batches,
+                                        /* step */ remaining_reduction_nelems};
+        const ResIndexerT res_iter_indexer{
+            batch_nd, batch_res_offset,
+            /* shape */ batch_shape_and_strides,
+            /* strides */ batch_shape_and_strides + 2 * batch_nd};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        wg = max_wg;
+        reductions_per_wi = std::max<std::size_t>(
+            1, (remaining_reduction_nelems + wg - 1) / wg);
+
+        reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev =
+            dpctl::tensor::kernels::submit_no_atomic_reduction<
+                resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT, dot_product_tree_reduction_krn>(
+                exec_q, temp_arg, res_tp, identity_val, wg, batches,
+                remaining_reduction_nelems, reductions_per_wi, reduction_groups,
+                in_out_iter_indexer, reduction_indexer, {dependent_ev});
+
+        // transfer ownership of USM allocation to host_task
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {final_reduction_ev}, partially_reduced_tmp_owner);
+
+        return cleanup_host_task_event;
+    }
+}
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event
+    dot_product_contig_tree_impl(sycl::queue &exec_q,
+                                 std::size_t batches,
+                                 std::size_t reduction_nelems,
+                                 const char *lhs_cp,
+                                 const char *rhs_cp,
+                                 char *res_cp,
+                                 ssize_t batch_lhs_offset,
+                                 ssize_t batch_rhs_offset,
+                                 ssize_t batch_res_offset,
+                                 ssize_t reduction_lhs_offset,
+                                 ssize_t reduction_rhs_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp) +
+                          batch_lhs_offset + reduction_lhs_offset;
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp) +
+                          batch_rhs_offset + reduction_rhs_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + batch_res_offset;
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputBatchIndexerT =
+            dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputBatchIndexerT =
+            dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
+                InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+
+        const InputBatchIndexerT inp_batch_indexer{/* size */ batches,
+                                                   /* step */ reduction_nelems};
+        const InputOutputBatchIndexerT inp_out_batch_indexer{
+            inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{},
+                                                             NoOpIndexerT{}};
+
+        sycl::event dot_ev =
+            sequential_dot_product<lhsTy, rhsTy, resTy,
+                                   InputOutputBatchIndexerT, ReductionIndexerT,
+                                   dot_product_seq_krn>(
+                exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems,
+                inp_out_batch_indexer, reduction_indexer, depends);
+
+        return dot_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    using ReductionOpT = typename std::conditional<std::is_same_v<resTy, bool>,
+                                                   sycl::logical_or<resTy>,
+                                                   sycl::plus<resTy>>::type;
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        using InputBatchIndexerT =
+            dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputBatchIndexerT =
+            dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
+                InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+
+        const InputBatchIndexerT inp_batch_indexer{/* size */ batches,
+                                                   /* step */ reduction_nelems};
+        const InputOutputBatchIndexerT inp_out_batch_indexer{
+            inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{},
+                                                             NoOpIndexerT{}};
+
+        if (batches == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event dot_ev = submit_no_atomic_dot_product<
+            lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT,
+            ReductionIndexerT, dot_product_tree_krn>(
+            exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems,
+            reductions_per_wi, reduction_groups, inp_out_batch_indexer,
+            reduction_indexer, depends);
+
+        return dot_ev;
+    }
+    else {
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+
+        // more than one work-groups is needed, requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        // unique_ptr that owns temporary allocation for partial reductions
+        auto partially_reduced_tmp_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                batches * (reduction_groups + second_iter_reduction_groups_),
+                exec_q);
+        // get raw pointers
+        resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * batches;
+
+        sycl::event first_reduction_ev;
+        {
+            using InputBatchIndexerT =
+                dpctl::tensor::offset_utils::Strided1DIndexer;
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputBatchIndexerT =
+                dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
+                    InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    NoOpIndexerT, NoOpIndexerT>;
+
+            const InputBatchIndexerT inp_batch_indexer{
+                /* size */ batches,
+                /* step */ reduction_nelems};
+            const InputOutputBatchIndexerT inp_out_batch_indexer{
+                inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
+            static constexpr ReductionIndexerT reduction_indexer{
+                NoOpIndexerT{}, NoOpIndexerT{}};
+
+            first_reduction_ev = submit_no_atomic_dot_product<
+                lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT,
+                ReductionIndexerT, dot_product_tree_krn>(
+                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, wg, batches,
+                reduction_nelems, preferred_reductions_per_wi, reduction_groups,
+                inp_out_batch_indexer, reduction_indexer, depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
+                 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+            const InputIndexerT inp_indexer{/* size */ batches,
+                                            /* step */ reduction_groups_};
+            static constexpr ResIndexerT res_iter_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              res_iter_indexer};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            sycl::event partial_reduction_ev =
+                dpctl::tensor::kernels::submit_no_atomic_reduction<
+                    resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                    ReductionIndexerT, dot_product_tree_reduction_krn>(
+                    exec_q, temp_arg, temp2_arg, identity_val, wg, batches,
+                    remaining_reduction_nelems, preferred_reductions_per_wi,
+                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
+                    {dependent_ev});
+
+            remaining_reduction_nelems = reduction_groups_;
+            std::swap(temp_arg, temp2_arg);
+            dependent_ev = std::move(partial_reduction_ev);
+        }
+
+        // final reduction to res
+        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        const InputIndexerT inp_indexer{/* size */ batches,
+                                        /* step */ remaining_reduction_nelems};
+        static constexpr ResIndexerT res_iter_indexer{};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        wg = max_wg;
+        reductions_per_wi = std::max<std::size_t>(
+            1, (remaining_reduction_nelems + wg - 1) / wg);
+
+        reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev =
+            dpctl::tensor::kernels::submit_no_atomic_reduction<
+                resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT, dot_product_tree_reduction_krn>(
+                exec_q, temp_arg, res_tp, identity_val, wg, batches,
+                remaining_reduction_nelems, reductions_per_wi, reduction_groups,
+                in_out_iter_indexer, reduction_indexer, {dependent_ev});
+
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {final_reduction_ev}, partially_reduced_tmp_owner);
+
+        return cleanup_host_task_event;
+    }
+}
+
+} // namespace dpctl::tensor::kernels
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
new file mode 100644
index 000000000000..8d2f1948754b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
@@ -0,0 +1,4240 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for general matrix multiplication (GEMM).
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/reductions.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels
+{
+
+using dpctl::tensor::ssize_t;
+
+namespace gemm_detail
+{
+
+template <typename T, std::size_t m_groups>
+void scale_gemm_k_parameters(const std::size_t &local_mem_size,
+                             const std::size_t &reserved_slm_size,
+                             const std::size_t delta_k,
+                             std::size_t &n_wi,
+                             std::size_t &delta_n)
+{
+    static constexpr std::size_t slm_elem_size = sizeof(T) * m_groups;
+
+    while (slm_elem_size * (n_wi + delta_n) * delta_k + reserved_slm_size >=
+           local_mem_size)
+    {
+        n_wi = n_wi / 2;
+        delta_n = delta_n / 2;
+        if (delta_n == 0)
+            throw std::runtime_error("Insufficient resources");
+    }
+}
+
+template <typename T, int wi_delta_m>
+void scale_gemm_nm_parameters(const std::size_t &local_mem_size,
+                              const std::size_t &reserved_slm_size,
+                              const std::size_t &wi_delta_n,
+                              std::size_t &wi_delta_k,
+                              std::size_t &wg_delta_n,
+                              std::size_t &wg_delta_m)
+{
+    static constexpr std::size_t slm_A_elem_size = sizeof(T);
+    static constexpr std::size_t slm_B_elem_size = sizeof(T) * wi_delta_m;
+
+    while ((wi_delta_n * wg_delta_n * wi_delta_k * slm_A_elem_size) +
+               (wi_delta_k * wg_delta_m * slm_B_elem_size) +
+               reserved_slm_size >=
+           local_mem_size)
+    {
+        wg_delta_n /= 2;
+        wg_delta_m /= 2;
+        wi_delta_k /= 2;
+        if (wg_delta_n == 0)
+            throw std::runtime_error("Insufficient resources");
+    }
+}
+} // namespace gemm_detail
+
+using dpctl::tensor::sycl_utils::choose_workgroup_size;
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class gemm_seq_reduction_krn;
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class gemm_tree_reduction_krn;
+
+template <typename T, typename ReductionOpT>
+sycl::event single_reduction_for_gemm(sycl::queue &exec_q,
+                                      T *tmp_tp,
+                                      T *res_tp,
+                                      T identity_val,
+                                      std::size_t iter_nelems,
+                                      std::size_t reduction_nelems,
+                                      std::size_t reduction_groups,
+                                      std::size_t wg,
+                                      std::size_t max_wg,
+                                      std::size_t preferred_reductions_per_wi,
+                                      std::size_t reductions_per_wi,
+                                      int res_nd,
+                                      ssize_t res_offset,
+                                      const ssize_t *res_shapes_strides,
+                                      const std::vector<sycl::event> &depends)
+{
+    sycl::event red_ev;
+    if (reduction_nelems < wg) {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        const ResIndexerT res_iter_indexer{res_nd, 0, res_shapes_strides};
+        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
+                                                          res_iter_indexer};
+        const ReductionIndexerT reduction_indexer{/* size   */ reduction_nelems,
+                                                  /* step   */ iter_nelems};
+
+        red_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            sycl::range<1> iter_range{iter_nelems};
+
+            cgh.parallel_for<class gemm_seq_reduction_krn<
+                T, T, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT>>(
+                iter_range,
+                SequentialReduction<T, T, ReductionOpT, InputOutputIterIndexerT,
+                                    ReductionIndexerT>(
+                    tmp_tp, res_tp, ReductionOpT(), identity_val,
+                    in_out_iter_indexer, reduction_indexer, reduction_nelems));
+        });
+    }
+    else {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        const ResIndexerT res_iter_indexer{res_nd, 0, res_shapes_strides};
+        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
+                                                          res_iter_indexer};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        red_ev = dpctl::tensor::kernels::submit_no_atomic_reduction<
+            T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
+            gemm_tree_reduction_krn>(
+            exec_q, tmp_tp, res_tp, identity_val, wg, iter_nelems,
+            reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, depends);
+    }
+    return red_ev;
+}
+
+template <typename T, typename ReductionOpT>
+sycl::event
+    single_reduction_for_gemm_contig(sycl::queue &exec_q,
+                                     T *tmp_tp,
+                                     T *res_tp,
+                                     T identity_val,
+                                     std::size_t iter_nelems,
+                                     std::size_t reduction_nelems,
+                                     std::size_t reduction_groups,
+                                     std::size_t wg,
+                                     std::size_t max_wg,
+                                     std::size_t preferred_reductions_per_wi,
+                                     std::size_t reductions_per_wi,
+                                     const std::vector<sycl::event> &depends)
+{
+    sycl::event red_ev;
+    if (reduction_nelems < wg) {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        static constexpr InputOutputIterIndexerT in_out_iter_indexer{
+            NoOpIndexerT{}, NoOpIndexerT{}};
+        // tmp allocation is a C-contiguous matrix (reduction_nelems,
+        // iter_nelems) and we are reducing by axis 0
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        red_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            sycl::range<1> iter_range{iter_nelems};
+
+            cgh.parallel_for<class gemm_seq_reduction_krn<
+                T, T, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT>>(
+                iter_range,
+                SequentialReduction<T, T, ReductionOpT, InputOutputIterIndexerT,
+                                    ReductionIndexerT>(
+                    tmp_tp, res_tp, ReductionOpT(), identity_val,
+                    in_out_iter_indexer, reduction_indexer, reduction_nelems));
+        });
+    }
+    else {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        static constexpr InputOutputIterIndexerT in_out_iter_indexer{
+            NoOpIndexerT{}, NoOpIndexerT{}};
+        // tmp allocation is a C-contiguous matrix
+        // (reduction_nelems, iter_nelems). Reducing along axis 0
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        red_ev = dpctl::tensor::kernels::submit_no_atomic_reduction<
+            T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
+            gemm_tree_reduction_krn>(
+            exec_q, tmp_tp, res_tp, identity_val, wg, iter_nelems,
+            reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, depends);
+    }
+    return red_ev;
+}
+
+template <typename T, typename ReductionOpT>
+sycl::event tree_reduction_for_gemm(sycl::queue &exec_q,
+                                    T *partially_reduced_tmp,
+                                    T *partially_reduced_tmp2,
+                                    T *res_tp,
+                                    T identity_val,
+                                    std::size_t iter_nelems,
+                                    std::size_t reduction_nelems,
+                                    std::size_t reduction_groups,
+                                    std::size_t wg,
+                                    std::size_t max_wg,
+                                    std::size_t preferred_reductions_per_wi,
+                                    std::size_t reductions_per_wi,
+                                    int res_nd,
+                                    ssize_t res_offset,
+                                    const ssize_t *res_shape_strides,
+                                    const std::vector<sycl::event> &depends)
+{
+    sycl::event first_reduction_ev;
+    {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        static constexpr InputOutputIterIndexerT in_out_iter_indexer{
+            NoOpIndexerT{}, NoOpIndexerT{}};
+        // partially_reduced_tmp is C-contig matrix with shape
+        // (reduction_nelems, iter_nelems). Reducing along axis 0.
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        first_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction<
+            T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
+            gemm_tree_reduction_krn>(
+            exec_q, partially_reduced_tmp, partially_reduced_tmp2, identity_val,
+            wg, iter_nelems, reduction_nelems, reductions_per_wi,
+            reduction_groups, in_out_iter_indexer, reduction_indexer, depends);
+    }
+
+    std::size_t remaining_reduction_nelems = reduction_groups;
+
+    T *temp_arg = partially_reduced_tmp2;
+    T *temp2_arg = partially_reduced_tmp;
+    sycl::event dependent_ev = first_reduction_ev;
+
+    while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) {
+        std::size_t reduction_groups_ = (remaining_reduction_nelems +
+                                         preferred_reductions_per_wi * wg - 1) /
+                                        (preferred_reductions_per_wi * wg);
+        assert(reduction_groups_ > 1);
+
+        // keep reducing
+        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                        /* step */ reduction_groups_};
+        static constexpr ResIndexerT res_iter_indexer{};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        sycl::event partial_reduction_ev =
+            dpctl::tensor::kernels::submit_no_atomic_reduction<
+                T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
+                gemm_tree_reduction_krn>(
+                exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems,
+                remaining_reduction_nelems, reductions_per_wi,
+                reduction_groups_, in_out_iter_indexer, reduction_indexer,
+                {dependent_ev});
+
+        remaining_reduction_nelems = reduction_groups_;
+        std::swap(temp_arg, temp2_arg);
+        dependent_ev = std::move(partial_reduction_ev);
+    }
+
+    // final reduction to res
+    using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+    using ResIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    using InputOutputIterIndexerT =
+        dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<InputIndexerT,
+                                                                ResIndexerT>;
+    using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+    const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                    /* step */ remaining_reduction_nelems};
+    const ResIndexerT res_iter_indexer{
+        /* ndim                */ res_nd,
+        /* offset              */ static_cast<ssize_t>(res_offset),
+        /* packed shape_strides*/ res_shape_strides};
+
+    const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                      res_iter_indexer};
+    static constexpr ReductionIndexerT reduction_indexer{};
+
+    wg = max_wg;
+    reductions_per_wi =
+        std::max<std::size_t>(1, (remaining_reduction_nelems + wg - 1) / wg);
+
+    reduction_groups =
+        (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+        (reductions_per_wi * wg);
+    assert(reduction_groups == 1);
+
+    sycl::event final_reduction_ev =
+        dpctl::tensor::kernels::submit_no_atomic_reduction<
+            T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
+            gemm_tree_reduction_krn>(
+            exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems,
+            remaining_reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, {dependent_ev});
+
+    return final_reduction_ev;
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class gemm_reduction_over_group_temps_contig_krn;
+
+template <typename T, typename ReductionOpT>
+sycl::event
+    tree_reduction_for_gemm_contig(sycl::queue &exec_q,
+                                   T *partially_reduced_tmp,
+                                   T *partially_reduced_tmp2,
+                                   T *res_tp,
+                                   T identity_val,
+                                   std::size_t iter_nelems,
+                                   std::size_t reduction_nelems,
+                                   std::size_t reduction_groups,
+                                   std::size_t wg,
+                                   std::size_t max_wg,
+                                   std::size_t preferred_reductions_per_wi,
+                                   std::size_t reductions_per_wi,
+                                   const std::vector<sycl::event> &depends)
+{
+    using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+    using InputOutputIterIndexerT =
+        dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<NoOpIndexerT,
+                                                                NoOpIndexerT>;
+    using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+    static constexpr InputOutputIterIndexerT in_out_iter_indexer{
+        NoOpIndexerT{}, NoOpIndexerT{}};
+    const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                              /* step */ iter_nelems};
+
+    const sycl::event &first_reduction_ev =
+        dpctl::tensor::kernels::submit_no_atomic_reduction<
+            T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
+            gemm_reduction_over_group_temps_contig_krn>(
+            exec_q, partially_reduced_tmp, partially_reduced_tmp2, identity_val,
+            wg, iter_nelems, reduction_nelems, reductions_per_wi,
+            reduction_groups, in_out_iter_indexer, reduction_indexer, depends);
+
+    std::size_t remaining_reduction_nelems = reduction_groups;
+
+    T *temp_arg = partially_reduced_tmp2;
+    T *temp2_arg = partially_reduced_tmp;
+    sycl::event dependent_ev = first_reduction_ev;
+
+    while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) {
+        std::size_t reduction_groups_ = (remaining_reduction_nelems +
+                                         preferred_reductions_per_wi * wg - 1) /
+                                        (preferred_reductions_per_wi * wg);
+        assert(reduction_groups_ > 1);
+
+        // keep reducing
+        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        // n * m = iter_nelems because essentially, this process
+        // creates a stack of reduction_nelems 2D matrices and we reduce
+        // along the stack axis
+        const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                        /* step */ reduction_groups_};
+        static constexpr ResIndexerT res_iter_indexer{};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        sycl::event partial_reduction_ev =
+            dpctl::tensor::kernels::submit_no_atomic_reduction<
+                T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
+                gemm_reduction_over_group_temps_contig_krn>(
+                exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems,
+                remaining_reduction_nelems, reductions_per_wi,
+                reduction_groups_, in_out_iter_indexer, reduction_indexer,
+                {dependent_ev});
+
+        remaining_reduction_nelems = reduction_groups_;
+        std::swap(temp_arg, temp2_arg);
+        dependent_ev = std::move(partial_reduction_ev);
+    }
+
+    // final reduction to res
+    {
+        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIndexerT, ResIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+        const InputIndexerT inp_indexer{
+            /* size   */ iter_nelems,
+            /* step   */ remaining_reduction_nelems};
+        static constexpr ResIndexerT res_iter_indexer{};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        wg = max_wg;
+        reductions_per_wi = std::max<std::size_t>(
+            1, (remaining_reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev =
+            dpctl::tensor::kernels::submit_no_atomic_reduction<
+                T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
+                gemm_reduction_over_group_temps_contig_krn>(
+                exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems,
+                remaining_reduction_nelems, reductions_per_wi, reduction_groups,
+                in_out_iter_indexer, reduction_indexer, {dependent_ev});
+
+        return final_reduction_ev;
+    }
+}
+
+template <typename lhsT,
+          typename rhsT,
+          typename resT,
+          typename LocAccT,
+          typename OuterInnerDimsIndexerT,
+          typename BatchDimsIndexerT,
+          std::size_t m_groups>
+class GemmBatchFunctorThreadK
+{
+private:
+    const lhsT *lhs = nullptr;
+    const rhsT *rhs = nullptr;
+    resT *res = nullptr;
+    LocAccT workspace;
+    LocAccT local_B_block;
+    std::size_t n = 0;
+    std::size_t n_blocks = 0;
+    std::size_t delta_n = 0;
+    std::size_t k = 0;
+    std::size_t k_blocks = 0;
+    std::size_t delta_k = 0;
+    std::size_t n_wi = 0;
+    std::size_t m = 0;
+    std::size_t batch_nelems = 0;
+    BatchDimsIndexerT batch_indexer;
+    OuterInnerDimsIndexerT lhs_indexer;
+    OuterInnerDimsIndexerT rhs_indexer;
+    OuterInnerDimsIndexerT res_indexer;
+
+public:
+    GemmBatchFunctorThreadK(const lhsT *lhs_,
+                            const rhsT *rhs_,
+                            resT *res_,
+                            LocAccT workspace_,
+                            LocAccT local_B_block_,
+                            std::size_t n_,
+                            std::size_t n_blocks_,
+                            std::size_t delta_n_,
+                            std::size_t k_,
+                            std::size_t k_blocks_,
+                            std::size_t delta_k_,
+                            std::size_t n_wi_,
+                            std::size_t m_,
+                            std::size_t batch_nelems_,
+                            const BatchDimsIndexerT &batch_indexer_,
+                            const OuterInnerDimsIndexerT &lhs_indexer_,
+                            const OuterInnerDimsIndexerT &rhs_indexer_,
+                            const OuterInnerDimsIndexerT &res_indexer_)
+        : lhs(lhs_), rhs(rhs_), res(res_), workspace(workspace_),
+          local_B_block(local_B_block_), n(n_), n_blocks(n_blocks_),
+          delta_n(delta_n_), k(k_), k_blocks(k_blocks_), delta_k(delta_k_),
+          n_wi(n_wi_), m(m_), batch_nelems(batch_nelems_),
+          batch_indexer(batch_indexer_), lhs_indexer(lhs_indexer_),
+          rhs_indexer(rhs_indexer_), res_indexer(res_indexer_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        // for batching:
+        // (current matrix in batch) m_id = global_id / (global_range /
+        // batch_nelems) for lhs, offset = m_id * (n * k) for rhs, offset =
+        // m_id
+        // * (k * m) for res, offset = m_id * (n * m)
+        const std::size_t n_groups_per_batch =
+            it.get_group_range(0) / batch_nelems;
+        const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch;
+        const std::size_t gr_id =
+            it.get_group_linear_id() - m_id * n_groups_per_batch;
+        const std::size_t lid = it.get_local_linear_id();
+
+        const auto &three_offsets_ = batch_indexer(static_cast<ssize_t>(m_id));
+
+        const auto &lhs_offset = three_offsets_.get_first_offset();
+        const auto &rhs_offset = three_offsets_.get_second_offset();
+        const auto &res_offset = three_offsets_.get_third_offset();
+
+        // lift gr_id -> (block_i, block_j, block_s)
+        //   block_i moves fastest, then block_s, then block_j
+
+        const std::size_t r_size = (n_blocks * k_blocks);
+        // 0 <= block_j < m_blocks,
+        const std::size_t block_j = gr_id / r_size;
+        // 0 <= block_r < n_blocks * k_blocks
+        const std::size_t block_r = gr_id - block_j * r_size;
+        // 0 <= block_s < k_blocks
+        const std::size_t block_s = block_r / n_blocks;
+        // 0 <= block_i < n_blocks
+        const std::size_t block_i = block_r - block_s * n_blocks;
+
+        // 0 <= local_i < delta_n
+        const std::size_t local_i = lid / (delta_k);
+        // 0 <= local_s < delta_k
+        const std::size_t local_s = lid - local_i * (delta_k);
+
+        std::size_t i = block_i * delta_n + local_i;
+        std::size_t j = m_groups * block_j;
+        std::size_t s = block_s * delta_k * n_wi + local_s;
+
+        using accV_t = typename LocAccT::value_type;
+
+        static constexpr resT identity_ = resT(0);
+        if (local_i == 0) {
+            for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) {
+                const std::size_t sq = s + q;
+                const std::size_t sqmj = sq * m + j;
+
+                if constexpr (m_groups == 1 && std::is_same_v<accV_t, resT>) {
+                    local_B_block[local_s + q] =
+                        (sq < k && j < m)
+                            ? static_cast<resT>(
+                                  rhs[rhs_offset + rhs_indexer(sqmj)])
+                            : identity_;
+                }
+                else {
+                    accV_t local_B_vec;
+#pragma unroll
+                    for (std::size_t vec_idx = 0; vec_idx < m_groups; ++vec_idx)
+                    {
+                        local_B_vec[vec_idx] =
+                            (sq < k && j + vec_idx < m)
+                                ? static_cast<resT>(
+                                      rhs[rhs_offset +
+                                          rhs_indexer(sqmj + vec_idx)])
+                                : identity_;
+                    }
+                    local_B_block[local_s + q] = local_B_vec;
+                }
+            }
+        }
+
+        it.barrier(sycl::access::fence_space::local_space);
+
+        std::size_t t_shift = block_s * delta_k * n_wi;
+        std::size_t global_s_offset = i * k + t_shift;
+
+        accV_t private_sum(identity_);
+        static constexpr accV_t vec_identity_(identity_);
+        for (std::size_t t = local_s; t < local_B_block.size(); t += delta_k) {
+            private_sum +=
+                ((i < n) && (t + t_shift < k))
+                    ? (static_cast<resT>(
+                           lhs[lhs_offset + lhs_indexer(global_s_offset + t)]) *
+                       local_B_block[t])
+                    : vec_identity_;
+        }
+
+        std::size_t workspace_i_shift = local_i * delta_k;
+        workspace[workspace_i_shift + local_s] = private_sum;
+
+        it.barrier(sycl::access::fence_space::local_space);
+
+        if (local_s == 0 && i < n) {
+            accV_t local_sum(workspace[workspace_i_shift]);
+            for (std::size_t t = 1; t < delta_k; ++t) {
+                local_sum += workspace[workspace_i_shift + t];
+            }
+
+            sycl::atomic_ref<resT, sycl::memory_order::relaxed,
+                             sycl::memory_scope::device,
+                             sycl::access::address_space::global_space>
+                aout0(res[res_offset + res_indexer(i * m + j)]);
+
+            if constexpr (m_groups == 1 && std::is_same_v<accV_t, resT>) {
+                aout0 += local_sum;
+            }
+            else {
+                aout0 += local_sum[0];
+
+#pragma unroll
+                for (std::size_t vec_id = 1; vec_id < m_groups; ++vec_id) {
+                    if (j + vec_id < m) {
+                        sycl::atomic_ref<
+                            resT, sycl::memory_order::relaxed,
+                            sycl::memory_scope::device,
+                            sycl::access::address_space::global_space>
+                            aout1(res[res_offset +
+                                      res_indexer(i * m + j + vec_id)]);
+
+                        aout1 += local_sum[vec_id];
+                    }
+                }
+            }
+        }
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class gemm_init_krn;
+
+template <typename T1, typename T2, typename T3, typename T4, std::size_t>
+class gemm_k_krn;
+
+template <typename T1, typename T2, typename T3, typename T4, std::size_t>
+class gemm_nm_krn;
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          std::size_t>
+class gemm_batch_k_krn;
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          std::size_t>
+class gemm_batch_nm_krn;
+
+namespace gemm_detail
+{
+
+template <typename lhsTy,
+          typename rhsTy,
+          typename resTy,
+          typename BatchIndexerT,
+          typename LhsIndexerT,
+          typename RhsIndexerT,
+          typename ResIndexerT>
+sycl::event _gemm_k_impl(sycl::queue &exec_q,
+                         const lhsTy *lhs_tp,
+                         const rhsTy *rhs_tp,
+                         resTy *res_tp,
+                         const std::size_t batch_nelems,
+                         const std::size_t n,
+                         const std::size_t k,
+                         const std::size_t m,
+                         const BatchIndexerT &batch_indexer,
+                         const LhsIndexerT &lhs_indexer,
+                         const RhsIndexerT &rhs_indexer,
+                         const ResIndexerT &res_indexer,
+                         const std::vector<sycl::event> &depends)
+{
+    static constexpr std::size_t m_groups = 4;
+    const std::size_t delta_k(4);
+    std::size_t n_wi(64);
+    std::size_t delta_n(32);
+
+    static_assert(std::is_same_v<LhsIndexerT, RhsIndexerT>);
+    static_assert(std::is_same_v<LhsIndexerT, ResIndexerT>);
+
+    const sycl::device &dev = exec_q.get_device();
+    const std::size_t local_mem_size =
+        dev.get_info<sycl::info::device::local_mem_size>();
+    const std::size_t reserved_slm_size = 512;
+
+    gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
+        local_mem_size, reserved_slm_size, delta_k,
+        n_wi,   // modified by reference
+        delta_n // modified by reference
+    );
+
+    std::size_t n_blocks = (n + delta_n - 1) / delta_n;
+    std::size_t m_blocks = (m + m_groups - 1) / m_groups;
+    std::size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k);
+
+    std::size_t lws = delta_n * delta_k;
+
+    auto gRange =
+        sycl::range<1>(batch_nelems * n_blocks * m_blocks * k_blocks * lws);
+    auto lRange = sycl::range<1>(lws);
+
+    auto ndRange = sycl::nd_range<1>(gRange, lRange);
+
+    sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using LocAccT = sycl::local_accessor<sycl::vec<resTy, m_groups>, 1>;
+        LocAccT local_B_block(n_wi * delta_k, cgh);
+        LocAccT workspace(delta_n * delta_k, cgh);
+
+        using KernelName =
+            class gemm_batch_k_krn<lhsTy, rhsTy, resTy, LhsIndexerT,
+                                   BatchIndexerT, m_groups>;
+        cgh.parallel_for<KernelName>(
+            ndRange,
+            GemmBatchFunctorThreadK<lhsTy, rhsTy, resTy, LocAccT, LhsIndexerT,
+                                    BatchIndexerT, m_groups>(
+                lhs_tp, rhs_tp, res_tp, std::move(workspace),
+                std::move(local_B_block), n, n_blocks, delta_n, k, k_blocks,
+                delta_k, n_wi, m, batch_nelems, batch_indexer, lhs_indexer,
+                rhs_indexer, res_indexer));
+    });
+    return gemm_ev;
+}
+
+template <typename lhsTy,
+          typename rhsTy,
+          typename resTy,
+          typename BatchIndexerT,
+          typename LhsIndexerT,
+          typename RhsIndexerT,
+          typename ResIndexerT>
+sycl::event _gemm_small_m_impl(sycl::queue &exec_q,
+                               const lhsTy *lhs_tp,
+                               const rhsTy *rhs_tp,
+                               resTy *res_tp,
+                               const std::size_t batch_nelems,
+                               const std::size_t n,
+                               const std::size_t k,
+                               const std::size_t m,
+                               const BatchIndexerT &batch_indexer,
+                               const LhsIndexerT &lhs_indexer,
+                               const RhsIndexerT &rhs_indexer,
+                               const ResIndexerT &res_indexer,
+                               const std::vector<sycl::event> &depends)
+{
+    static constexpr std::size_t m_groups = 1;
+    const std::size_t delta_k(4);
+    std::size_t n_wi(64);
+    std::size_t delta_n(32);
+
+    static_assert(std::is_same_v<LhsIndexerT, RhsIndexerT>);
+    static_assert(std::is_same_v<LhsIndexerT, ResIndexerT>);
+
+    const sycl::device &dev = exec_q.get_device();
+    const std::size_t local_mem_size =
+        dev.get_info<sycl::info::device::local_mem_size>();
+    const std::size_t reserved_slm_size = 512;
+
+    gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
+        local_mem_size, reserved_slm_size, delta_k,
+        n_wi,   // modified by reference
+        delta_n // modified by reference
+    );
+
+    std::size_t n_blocks = (n + delta_n - 1) / delta_n;
+    std::size_t m_blocks = (m + m_groups - 1) / m_groups;
+    std::size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k);
+
+    std::size_t lws = delta_n * delta_k;
+
+    auto gRange =
+        sycl::range<1>(batch_nelems * n_blocks * m_blocks * k_blocks * lws);
+    auto lRange = sycl::range<1>(lws);
+
+    auto ndRange = sycl::nd_range<1>(gRange, lRange);
+
+    sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using LocAccT = sycl::local_accessor<resTy, 1>;
+        LocAccT local_B_block(n_wi * delta_k, cgh);
+        LocAccT workspace(delta_n * delta_k, cgh);
+
+        using KernelName =
+            class gemm_batch_k_krn<lhsTy, rhsTy, resTy, LhsIndexerT,
+                                   BatchIndexerT, m_groups>;
+        cgh.parallel_for<KernelName>(
+            ndRange,
+            GemmBatchFunctorThreadK<lhsTy, rhsTy, resTy, LocAccT, LhsIndexerT,
+                                    BatchIndexerT, m_groups>(
+                lhs_tp, rhs_tp, res_tp, std::move(workspace),
+                std::move(local_B_block), n, n_blocks, delta_n, k, k_blocks,
+                delta_k, n_wi, m, batch_nelems, batch_indexer, lhs_indexer,
+                rhs_indexer, res_indexer));
+    });
+
+    return gemm_ev;
+}
+
+} // end of namespace gemm_detail
+
+template <typename lhsT,
+          typename rhsT,
+          typename resT,
+          typename LocAccT1,
+          typename LocAccT2,
+          typename BatchDimsIndexerT,
+          typename LhsIndexerT,
+          typename RhsIndexerT,
+          typename ResIndexerT,
+          std::uint32_t wi_delta_n,
+          std::uint32_t wi_delta_m_vecs,
+          std::uint32_t m_vec_size>
+class GemmBatchFunctorThreadNM_vecm
+{
+private:
+    const lhsT *lhs = nullptr;
+    const rhsT *rhs = nullptr;
+    resT *res = nullptr;
+    LocAccT1 local_lhs_block;
+    LocAccT2 local_rhs_block;
+    std::size_t batch_nelems;
+    std::size_t n = 0;
+    std::size_t k = 0;
+    std::size_t m = 0;
+    std::size_t n_groups = 0;
+    std::uint32_t wg_delta_n = 0;
+    std::uint32_t wg_delta_m = 0;
+    std::uint32_t wi_delta_k = 0;
+    BatchDimsIndexerT batch_indexer;
+    LhsIndexerT lhs_indexer;
+    RhsIndexerT rhs_indexer;
+    ResIndexerT res_indexer;
+
+public:
+    /*! @brief */
+    GemmBatchFunctorThreadNM_vecm(const lhsT *lhs_,
+                                  const rhsT *rhs_,
+                                  resT *res_,
+                                  LocAccT1 local_lhs_block_,
+                                  LocAccT2 local_rhs_block_,
+                                  std::size_t batch_nelems_,
+                                  std::size_t n_,
+                                  std::size_t k_,
+                                  std::size_t m_,
+                                  std::size_t n_groups_,
+                                  std::size_t wg_delta_n_,
+                                  std::size_t wg_delta_m_,
+                                  std::size_t wi_delta_k_,
+                                  const BatchDimsIndexerT &batch_indexer_,
+                                  const LhsIndexerT &lhs_indexer_,
+                                  const RhsIndexerT &rhs_indexer_,
+                                  const ResIndexerT &res_indexer_)
+        : lhs(lhs_), rhs(rhs_), res(res_), local_lhs_block(local_lhs_block_),
+          local_rhs_block(local_rhs_block_), batch_nelems(batch_nelems_), n(n_),
+          k(k_), m(m_), n_groups(n_groups_), wg_delta_n(wg_delta_n_),
+          wg_delta_m(wg_delta_m_), wi_delta_k(wi_delta_k_),
+          batch_indexer(batch_indexer_), lhs_indexer(lhs_indexer_),
+          rhs_indexer(rhs_indexer_), res_indexer(res_indexer_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        static constexpr resT zero_(0);
+        static constexpr std::uint32_t wi_total_delta_m =
+            wi_delta_m_vecs * m_vec_size;
+
+        const std::size_t gws_per_batch = it.get_group_range(0) / batch_nelems;
+        const std::size_t batch_id = it.get_group_linear_id() / gws_per_batch;
+        const std::size_t gr_id =
+            it.get_group_linear_id() - batch_id * gws_per_batch;
+
+        const auto &three_offsets_ =
+            batch_indexer(static_cast<ssize_t>(batch_id));
+
+        const auto &lhs_offset = three_offsets_.get_first_offset();
+        const auto &rhs_offset = three_offsets_.get_second_offset();
+        const auto &res_offset = three_offsets_.get_third_offset();
+
+        // 0 <= block_j < m_groups
+        const std::size_t block_j = gr_id / n_groups;
+        // 0 <= block_i < n_groups
+        const std::size_t block_i = gr_id - block_j * n_groups;
+
+        // Assumption: lws == wg_delta_n * wg_delta_m
+        const std::uint32_t lid = it.get_local_linear_id();
+        // 0 <= local_j < (lws / wg_delta_n == wg_delta_m)
+        const std::uint32_t local_j = lid / wg_delta_n;
+        // sub-group lanes map to adjacent local_i
+        const std::uint32_t local_i = lid - local_j * wg_delta_n;
+
+        // Coordinates of the block of C the work-group works on
+        std::size_t i = block_i * wg_delta_n * wi_delta_n;
+        std::size_t j = block_j * wg_delta_m * wi_total_delta_m;
+
+        using slmA_t = typename LocAccT1::value_type;
+        using slmB_t = typename LocAccT2::value_type;
+
+        const std::size_t a_st0 = k;
+        const std::size_t a_st1 = 1;
+
+        const std::size_t b_st0 = m;
+        const std::size_t b_st1 = 1;
+
+        const std::size_t c_st0 = m;
+        const std::size_t c_st1 = 1;
+
+        // allocate/initialize private matrix C
+        // size ( wi_total_delta_n, wi_total_delta_m )
+        static constexpr std::uint32_t C_size = wi_delta_n * wi_delta_m_vecs;
+        std::array<slmB_t, C_size> private_C{slmB_t{zero_}};
+
+        for (std::size_t s = 0; s < k; s += wi_delta_k) {
+            // populate local_lhs_block<resT> ( wg_delta_n * wi_delta_n,
+            // wi_delta_k)
+            for (std::uint32_t vid = lid; vid < local_lhs_block.size();
+                 vid += it.get_local_range()[0])
+            {
+                // 0 <= v_i < wg_delta_n * wi_delta_n
+                const std::uint32_t v_i = vid / wi_delta_k;
+                // 0 <= v_s < wi_delta_k
+                const std::uint32_t v_s = vid - v_i * wi_delta_k;
+
+                const std::size_t g_i = i + v_i;
+                const std::size_t g_s = s + v_s;
+
+                const std::uint32_t mapped_vid =
+                    wg_delta_n * wi_delta_n * v_s + v_i;
+                local_lhs_block[mapped_vid] =
+                    (g_i < n && g_s < k)
+                        ? static_cast<resT>(
+                              lhs[lhs_offset +
+                                  lhs_indexer(g_i * a_st0 + g_s * a_st1)])
+                        : zero_;
+            }
+
+            // populate local_rhs_block<vec<resT, m_vec_size>> ( wg_delta_m *
+            // wi_delta_m_vecs, wi_delta_k )
+            for (std::uint32_t vid = lid; vid < local_rhs_block.size();
+                 vid += it.get_local_range()[0])
+            {
+                // 0 <= v_j < wg_delta_m * wi_delta_m_vecs
+                const std::uint32_t v_j = vid / wi_delta_k;
+                // 0 <= v_s < wi_delta_k
+                const std::uint32_t v_s = vid - v_j * wi_delta_k;
+
+                const std::size_t g_j = j + v_j * m_vec_size;
+                const std::size_t g_s = s + v_s;
+                const std::uint32_t mapped_vid =
+                    wg_delta_m * wi_delta_m_vecs * v_s + v_j;
+
+                if constexpr (m_vec_size == 1) {
+                    local_rhs_block[mapped_vid] =
+                        (g_j < m && g_s < k)
+                            ? static_cast<resT>(
+                                  rhs[rhs_offset +
+                                      rhs_indexer(g_s * b_st0 + g_j * b_st1)])
+                            : zero_;
+                }
+                else {
+                    slmB_t vec{};
+#pragma unroll
+                    for (std::uint32_t lane_id = 0; lane_id < m_vec_size;
+                         ++lane_id) {
+                        const std::size_t g_j1 = g_j + lane_id;
+                        vec[lane_id] = (g_j1 < m && g_s < k)
+                                           ? static_cast<resT>(
+                                                 rhs[rhs_offset +
+                                                     rhs_indexer(g_s * b_st0 +
+                                                                 g_j1 * b_st1)])
+                                           : zero_;
+                    };
+
+                    local_rhs_block[mapped_vid] = vec;
+                }
+            }
+
+            it.barrier(sycl::access::fence_space::local_space);
+
+            const std::uint32_t lo_lhs_st_k = (wg_delta_n * wi_delta_n);
+            const std::uint32_t lo_rhs_rk_k = (wg_delta_m * wi_delta_m_vecs);
+            for (std::uint32_t pr_k = 0; pr_k < wi_delta_k; ++pr_k) {
+                std::array<slmA_t, wi_delta_n> pr_lhs{};
+#pragma unroll
+                for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) {
+                    pr_lhs[pr_i] =
+                        local_lhs_block[pr_k * lo_lhs_st_k +
+                                        (local_i + pr_i * wg_delta_n)];
+                }
+
+                std::array<slmB_t, wi_delta_m_vecs> pr_rhs{};
+#pragma unroll
+                for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j) {
+                    pr_rhs[pr_j] =
+                        local_rhs_block[pr_k * lo_rhs_rk_k +
+                                        (local_j + pr_j * wg_delta_m)];
+                }
+
+#pragma unroll
+                for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) {
+#pragma unroll
+                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j)
+                    {
+                        private_C[pr_i * wi_delta_m_vecs + pr_j] +=
+                            pr_lhs[pr_i] * pr_rhs[pr_j];
+                    }
+                }
+            }
+
+            it.barrier(sycl::access::fence_space::local_space);
+        }
+
+        if constexpr (m_vec_size == 1) {
+#pragma unroll
+            for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) {
+                std::size_t out_i = i + local_i + pr_i * wg_delta_n;
+                if (out_i < n) {
+#pragma unroll
+                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j)
+                    {
+                        const std::size_t out_j =
+                            j + (local_j + pr_j * wg_delta_m) * m_vec_size;
+                        const std::size_t out_flat_id =
+                            out_i * c_st0 + out_j * c_st1;
+                        if (out_j < m) {
+                            res[res_offset + res_indexer(out_flat_id)] =
+                                private_C[pr_i * wi_delta_m_vecs + pr_j];
+                        }
+                    }
+                }
+            }
+        }
+        else {
+#pragma unroll
+            for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) {
+                std::size_t out_i = i + local_i + pr_i * wg_delta_n;
+                if (out_i < n) {
+                    // could be unrolled
+                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j)
+                    {
+                        std::size_t out_j =
+                            j + (local_j + pr_j * wg_delta_m) * m_vec_size;
+#pragma unroll
+                        for (std::uint32_t lane_id = 0; lane_id < m_vec_size;
+                             ++lane_id) {
+                            const std::size_t out_flat_id =
+                                out_i * c_st0 + (out_j + lane_id) * c_st1;
+                            if (out_j + lane_id < m) {
+                                res[res_offset + res_indexer(out_flat_id)] =
+                                    private_C[pr_i * wi_delta_m_vecs + pr_j]
+                                             [lane_id];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+};
+
+struct GemmBatchFunctorThreadNM_vecm_HyperParameters
+{
+private:
+    std::uint32_t wi_delta_n = 2;
+    std::uint32_t wi_delta_m_vecs = 4;
+    std::uint32_t m_vec_size = 1;
+
+public:
+    constexpr GemmBatchFunctorThreadNM_vecm_HyperParameters();
+    constexpr GemmBatchFunctorThreadNM_vecm_HyperParameters(
+        std::uint32_t wi_delta_n_,
+        std::uint32_t wi_delta_m_vecs_,
+        std::uint32_t m_vec_size_)
+        : wi_delta_n(wi_delta_n_), wi_delta_m_vecs(wi_delta_m_vecs_),
+          m_vec_size(m_vec_size_)
+    {
+    }
+
+    constexpr std::uint32_t get_wi_delta_n() const
+    {
+        return wi_delta_n;
+    }
+    constexpr std::uint32_t get_wi_delta_m_vecs() const
+    {
+        return wi_delta_m_vecs;
+    }
+    constexpr std::uint32_t get_m_vec_size() const
+    {
+        return m_vec_size;
+    }
+};
+
+template <typename resT>
+struct GemmBatchFunctorThreadNM_vecm_HyperParametersSelector
+{
+    constexpr GemmBatchFunctorThreadNM_vecm_HyperParametersSelector() {}
+
+    constexpr GemmBatchFunctorThreadNM_vecm_HyperParameters get() const
+    {
+        if constexpr (sizeof(resT) == 1) {
+            // 1 * 8 * 2 * 4 == 64
+            return GemmBatchFunctorThreadNM_vecm_HyperParameters(8, 2, 4);
+        }
+        else if constexpr (sizeof(resT) == 2) {
+            // 2 * 4 * 2 * 4 == 64
+            return GemmBatchFunctorThreadNM_vecm_HyperParameters(4, 2, 4);
+        }
+        else if constexpr (sizeof(resT) == 4) {
+            // 4 * 4 * 1 * 4 == 64
+            return GemmBatchFunctorThreadNM_vecm_HyperParameters(4, 1, 4);
+        }
+        else if constexpr (sizeof(resT) == 8) {
+            // 8 * 2 * 1 * 4 == 64
+            if constexpr (std::is_same_v<resT, std::complex<float>>) {
+                return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 4, 1);
+            }
+            else {
+                return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 1, 4);
+            }
+        }
+        else if constexpr (std::is_same_v<resT, std::complex<double>>) {
+            // 16 * 2 * 2 * 1 == 64
+            return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 2, 1);
+        }
+        else {
+            return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 2, 1);
+        }
+    }
+};
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          typename T6,
+          typename T7,
+          std::uint32_t p1,
+          std::uint32_t p2,
+          std::uint32_t p3>
+class gemm_batch_nm_vecm_krn;
+
+namespace gemm_detail
+{
+
+template <typename T, std::uint32_t wi_delta_n, std::uint32_t wi_delta_m>
+std::tuple<std::uint32_t, std::uint32_t>
+    get_wg_delta_m_and_wi_delta_k(const std::size_t slm_byte_size,
+                                  const std::uint32_t wg_delta_n,
+                                  const std::uint32_t suggested_wg_delta_m)
+{
+    std::uint32_t wg_delta_m = suggested_wg_delta_m;
+
+    const std::size_t slm_max_rows =
+        slm_byte_size /
+        ((wg_delta_n * wi_delta_n + wg_delta_m * wi_delta_m) * sizeof(T));
+
+    std::uint32_t wi_delta_k =
+        (slm_max_rows >= 64)
+            ? 64
+            : 32 * static_cast<std::uint32_t>(slm_max_rows / 32);
+
+    for (std::uint32_t it = 0; !wi_delta_k && (it < 4); ++it) {
+        wg_delta_m /= 2;
+
+        const std::size_t slm_max_rows =
+            slm_byte_size /
+            ((wg_delta_n * wi_delta_n + wg_delta_m * wi_delta_m) * sizeof(T));
+
+        wi_delta_k =
+            (slm_max_rows >= 64)
+                ? 64
+                : ((slm_max_rows >= 32)
+                       ? 32
+                       : (slm_max_rows >= 16 ? 16
+                                             : 8 * static_cast<std::uint32_t>(
+                                                       slm_max_rows / 8)));
+    }
+
+    if (!wi_delta_k) {
+        throw std::runtime_error("Insufficient resources");
+    }
+
+    return std::make_tuple(wg_delta_m, wi_delta_k);
+}
+
+template <typename lhsTy,
+          typename rhsTy,
+          typename resTy,
+          typename BatchIndexerT,
+          typename LhsIndexerT,
+          typename RhsIndexerT,
+          typename ResIndexerT>
+sycl::event _gemm_batch_nm_impl(sycl::queue &exec_q,
+                                const lhsTy *lhs_tp,
+                                const rhsTy *rhs_tp,
+                                resTy *res_tp,
+                                const std::size_t batch_nelems,
+                                const std::size_t n,
+                                const std::size_t k,
+                                const std::size_t m,
+                                const BatchIndexerT &batch_indexer,
+                                const LhsIndexerT &lhs_indexer,
+                                const RhsIndexerT &rhs_indexer,
+                                const ResIndexerT &res_indexer,
+                                std::vector<sycl::event> const &depends)
+{
+    static constexpr GemmBatchFunctorThreadNM_vecm_HyperParametersSelector<
+        resTy>
+        selector{};
+    static constexpr auto hyper_params = selector.get();
+
+    static constexpr std::uint32_t wi_delta_n = hyper_params.get_wi_delta_n();
+    static constexpr std::uint32_t wi_delta_m_vecs =
+        hyper_params.get_wi_delta_m_vecs();
+    static constexpr std::uint32_t m_vec_size = hyper_params.get_m_vec_size();
+
+    static constexpr std::uint32_t wi_total_delta_m =
+        wi_delta_m_vecs * m_vec_size;
+
+    using KernelName =
+        class gemm_batch_nm_vecm_krn<lhsTy, rhsTy, resTy, BatchIndexerT,
+                                     LhsIndexerT, RhsIndexerT, ResIndexerT,
+                                     wi_delta_n, wi_delta_m_vecs, m_vec_size>;
+
+    const auto &kernel_id = sycl::get_kernel_id<KernelName>();
+
+    auto const &ctx = exec_q.get_context();
+    auto const &dev = exec_q.get_device();
+    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+        ctx, {dev}, {kernel_id});
+
+    auto krn = kb.get_kernel(kernel_id);
+
+    const std::uint32_t max_sg_size = krn.template get_info<
+        sycl::info::kernel_device_specific::max_sub_group_size>(dev);
+
+    const std::size_t k_wg_sz = krn.template get_info<
+        sycl::info::kernel_device_specific::work_group_size>(dev);
+
+    // Limit work-group size
+    static constexpr std::size_t wg_sz_limit(2048);
+    const std::size_t max_wg_sz = std::min(wg_sz_limit, k_wg_sz);
+
+    const std::uint32_t max_subgroups_per_wg =
+        static_cast<std::uint32_t>(max_wg_sz / max_sg_size);
+
+    const std::size_t reserved_slm_byte_size = 512;
+    const std::size_t slm_byte_size =
+        dev.get_info<sycl::info::device::local_mem_size>();
+
+    const std::uint32_t wg_delta_n = max_sg_size;
+    std::uint32_t wg_delta_m = 0;
+    std::uint32_t wi_delta_k = 0;
+
+    std::tie(wg_delta_m, wi_delta_k) =
+        get_wg_delta_m_and_wi_delta_k<resTy, wi_delta_n, wi_total_delta_m>(
+            slm_byte_size - reserved_slm_byte_size, wg_delta_n,
+            max_subgroups_per_wg);
+
+    const std::uint32_t lws = wg_delta_n * wg_delta_m;
+
+    const std::size_t n_groups =
+        (n + wg_delta_n * wi_delta_n - 1) / (wg_delta_n * wi_delta_n);
+    const std::size_t m_groups = (m + wg_delta_m * wi_total_delta_m - 1) /
+                                 (wg_delta_m * wi_total_delta_m);
+
+    const std::size_t gws = lws * batch_nelems * n_groups * m_groups;
+
+    sycl::range<1> lRange(lws);
+    sycl::range<1> gRange(gws);
+    sycl::nd_range<1> ndRange(gRange, lRange);
+
+    using slmB_t =
+        typename std::conditional<m_vec_size == 1, resTy,
+                                  sycl::vec<resTy, m_vec_size>>::type;
+
+    sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        cgh.use_kernel_bundle(kb);
+
+        using LocAccT1 = sycl::local_accessor<resTy, 1>;
+        LocAccT1 local_A_block(wg_delta_n * wi_delta_n * wi_delta_k, cgh);
+
+        using LocAccT2 = sycl::local_accessor<slmB_t, 1>;
+        LocAccT2 local_B_block(wg_delta_m * wi_delta_m_vecs * wi_delta_k, cgh);
+
+        using Impl_FunctorT = GemmBatchFunctorThreadNM_vecm<
+            lhsTy, rhsTy, resTy, LocAccT1, LocAccT2, BatchIndexerT, LhsIndexerT,
+            RhsIndexerT, ResIndexerT, wi_delta_n, wi_delta_m_vecs, m_vec_size>;
+
+        cgh.parallel_for<KernelName>(
+            ndRange, Impl_FunctorT(
+                         lhs_tp, rhs_tp, res_tp, std::move(local_A_block),
+                         std::move(local_B_block), batch_nelems, n, k, m,
+                         n_groups, wg_delta_n, wg_delta_m, wi_delta_k,
+                         batch_indexer, lhs_indexer, rhs_indexer, res_indexer));
+    });
+    return gemm_ev;
+}
+
+} // namespace gemm_detail
+
+typedef sycl::event (*gemm_impl_fn_ptr_t)(
+    sycl::queue &,
+    const char *,    // lhs
+    const char *,    // rhs
+    char *,          // res
+    std::size_t,     // lhs_outer_nelems (n)
+    std::size_t,     // inner_nelems (k)
+    std::size_t,     // rhs_outer_nelems (m)
+    int,             // inner nd
+    int,             // lhs outer nd
+    const ssize_t *, // lhs shape and strides
+    int,             // rhs outer nd
+    const ssize_t *, // rhs shape and strides
+    int,             // res outer nd
+    const ssize_t *, // res shape and strides
+    std::vector<sycl::event> const &);
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_impl(sycl::queue &exec_q,
+                      const char *lhs_cp,
+                      const char *rhs_cp,
+                      char *res_cp,
+                      std::size_t n,
+                      std::size_t k,
+                      std::size_t m,
+                      int inner_nd,
+                      int lhs_outer_nd,
+                      const ssize_t *lhs_shape_strides,
+                      int rhs_outer_nd,
+                      const ssize_t *rhs_shape_strides,
+                      int res_outer_nd,
+                      const ssize_t *res_shape_strides,
+                      std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    using OuterInnerIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    const OuterInnerIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
+                                         lhs_shape_strides);
+    const OuterInnerIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
+                                         rhs_shape_strides);
+    const OuterInnerIndexerT res_indexer(res_outer_nd, 0, res_shape_strides);
+
+    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
+    static constexpr BatchIndexerT batch_indexer{};
+
+    static constexpr std::size_t single_batch_nelems = 1;
+
+    const std::size_t min_nm = std::min(n, m);
+    const std::size_t max_nm = std::max(n, m);
+
+    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
+        return gemm_detail::_gemm_batch_nm_impl<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+            OuterInnerIndexerT, OuterInnerIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+    }
+
+    sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+        const IndexerT res_indexer(res_outer_nd, 0, res_shape_strides);
+        using InitKernelName = class gemm_init_krn<lhsTy, rhsTy, resTy>;
+        cgh.parallel_for<InitKernelName>(
+            sycl::range<1>(n * m), [=](sycl::id<1> id) {
+                auto res_offset = res_indexer(id[0]);
+                res_tp[res_offset] = resTy(0);
+            });
+    });
+
+    if (k == 0) {
+        return res_init_ev;
+    }
+
+    if ((max_nm < 64)) {
+        if (m < 4) {
+            return gemm_detail::_gemm_small_m_impl<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+                OuterInnerIndexerT, OuterInnerIndexerT>(
+                exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+                batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+                {res_init_ev});
+        }
+        return gemm_detail::_gemm_k_impl<lhsTy, rhsTy, resTy, BatchIndexerT,
+                                         OuterInnerIndexerT, OuterInnerIndexerT,
+                                         OuterInnerIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+            {res_init_ev});
+    }
+
+    return gemm_detail::_gemm_batch_nm_impl<
+        lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+        OuterInnerIndexerT, OuterInnerIndexerT>(
+        exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+        batch_indexer, lhs_indexer, rhs_indexer, res_indexer, {res_init_ev});
+}
+
+typedef sycl::event (*gemm_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    const char *, // lhs
+    const char *, // rhs
+    char *,       // res
+    std::size_t,  // n
+    std::size_t,  // k
+    std::size_t,  // m
+    std::vector<sycl::event> const &);
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_contig_impl(sycl::queue &exec_q,
+                             const char *lhs_cp,
+                             const char *rhs_cp,
+                             char *res_cp,
+                             std::size_t n,
+                             std::size_t k,
+                             std::size_t m,
+                             std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    using OuterInnerIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+    static constexpr OuterInnerIndexerT lhs_indexer{};
+    static constexpr OuterInnerIndexerT rhs_indexer{};
+    static constexpr OuterInnerIndexerT res_indexer{};
+
+    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
+    static constexpr BatchIndexerT batch_indexer{};
+
+    static constexpr std::size_t single_batch_nelems = 1;
+
+    const std::size_t min_nm = std::min(n, m);
+    const std::size_t max_nm = std::max(n, m);
+    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
+        return gemm_detail::_gemm_batch_nm_impl<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+            OuterInnerIndexerT, OuterInnerIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+    }
+
+    sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.fill<resTy>(res_tp, resTy(0), n * m);
+    });
+
+    if (k == 0) {
+        return res_init_ev;
+    }
+
+    if (max_nm < 64) {
+        if (m < 4) {
+            return gemm_detail::_gemm_small_m_impl<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+                OuterInnerIndexerT, OuterInnerIndexerT>(
+                exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+                batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+                {res_init_ev});
+        }
+        return gemm_detail::_gemm_k_impl<lhsTy, rhsTy, resTy, BatchIndexerT,
+                                         OuterInnerIndexerT, OuterInnerIndexerT,
+                                         OuterInnerIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+            {res_init_ev});
+    }
+
+    return gemm_detail::_gemm_batch_nm_impl<
+        lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+        OuterInnerIndexerT, OuterInnerIndexerT>(
+        exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+        batch_indexer, lhs_indexer, rhs_indexer, res_indexer, {res_init_ev});
+}
+
+template <typename T1, typename T2, typename T3>
+class gemm_batch_init_krn;
+
+typedef sycl::event (*gemm_batch_impl_fn_ptr_t)(
+    sycl::queue &,
+    const char *,    // lhs
+    const char *,    // rhs
+    char *,          // res
+    std::size_t,     // batch nelems
+    std::size_t,     // lhs outer nelems (n)
+    std::size_t,     // inner nelems (k)
+    std::size_t,     // rhs outer nelems (m)
+    int,             // batching nd
+    const ssize_t *, // batch shape strides
+    ssize_t,         // lhs batch offset
+    ssize_t,         // rhs batch offset
+    ssize_t,         // res batch offset
+    int,             // inner dims
+    int,             // lhs outer dims
+    const ssize_t *, // lhs outer and inner shape and strides
+    int,             // rhs outer dims
+    const ssize_t *, // rhs outer and inner shape and strides
+    int,             // res outer dims
+    const ssize_t *, // res outer and inner shape and strides
+    const ssize_t *, // res full shape and strides
+    std::vector<sycl::event> const &);
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_batch_impl(sycl::queue &exec_q,
+                            const char *lhs_cp,
+                            const char *rhs_cp,
+                            char *res_cp,
+                            std::size_t batch_nelems,
+                            std::size_t n,
+                            std::size_t k,
+                            std::size_t m,
+                            int batch_nd,
+                            const ssize_t *batch_shape_strides,
+                            ssize_t lhs_batch_offset,
+                            ssize_t rhs_batch_offset,
+                            ssize_t res_batch_offset,
+                            int inner_nd,
+                            int lhs_outer_nd,
+                            const ssize_t *lhs_outer_inner_shapes_strides,
+                            int rhs_outer_nd,
+                            const ssize_t *rhs_outer_inner_shapes_strides,
+                            int res_outer_nd,
+                            const ssize_t *res_outer_shapes_strides,
+                            const ssize_t *res_shape_strides,
+                            std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
+                                             lhs_outer_inner_shapes_strides);
+    const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
+                                             rhs_outer_inner_shapes_strides);
+    const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0,
+                                             res_outer_shapes_strides);
+    using BatchDimsIndexerT =
+        dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
+    const BatchDimsIndexerT batch_indexer(batch_nd, lhs_batch_offset,
+                                          rhs_batch_offset, res_batch_offset,
+                                          batch_shape_strides);
+
+    const std::size_t min_nm = std::min(n, m);
+    const std::size_t max_nm = std::max(n, m);
+
+    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
+        return gemm_detail::_gemm_batch_nm_impl<
+            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+    }
+
+    sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+        const IndexerT res_indexer(batch_nd + res_outer_nd, res_batch_offset,
+                                   res_shape_strides);
+        using InitKernelName = class gemm_batch_init_krn<lhsTy, rhsTy, resTy>;
+        cgh.parallel_for<InitKernelName>(
+            sycl::range<1>(n * m * batch_nelems), [=](sycl::id<1> id) {
+                auto res_offset = res_indexer(id[0]);
+                res_tp[res_offset] = resTy(0);
+            });
+    });
+
+    if (k == 0) {
+        return res_init_ev;
+    }
+
+    if (m < 4) {
+        return gemm_detail::_gemm_small_m_impl<
+            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+            {res_init_ev});
+    }
+    else if (k > n && k > m) {
+        return gemm_detail::_gemm_k_impl<
+            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+            {res_init_ev});
+    }
+    else {
+        return gemm_detail::_gemm_batch_nm_impl<
+            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+            {res_init_ev});
+    }
+}
+
+typedef sycl::event (*gemm_batch_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    const char *, // lhs
+    const char *, // rhs
+    char *,       // res
+    std::size_t,  // batch nelems
+    std::size_t,  // n
+    std::size_t,  // k
+    std::size_t,  // m
+    ssize_t,      // lhs batch offset
+    ssize_t,      // rhs batch offset
+    ssize_t,      // res batch offset
+    std::vector<sycl::event> const &);
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_batch_contig_impl(sycl::queue &exec_q,
+                                   const char *lhs_cp,
+                                   const char *rhs_cp,
+                                   char *res_cp,
+                                   std::size_t batch_nelems,
+                                   std::size_t n,
+                                   std::size_t k,
+                                   std::size_t m,
+                                   ssize_t lhs_batch_offset,
+                                   ssize_t rhs_batch_offset,
+                                   ssize_t res_batch_offset,
+                                   std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy *lhs_tp =
+        reinterpret_cast<const lhsTy *>(lhs_cp) + lhs_batch_offset;
+    const rhsTy *rhs_tp =
+        reinterpret_cast<const rhsTy *>(rhs_cp) + rhs_batch_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + res_batch_offset;
+
+    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+    static constexpr OuterInnerDimsIndexerT lhs_indexer{};
+    static constexpr OuterInnerDimsIndexerT rhs_indexer{};
+    static constexpr OuterInnerDimsIndexerT res_indexer{};
+
+    using dpctl::tensor::offset_utils::Strided1DIndexer;
+    using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
+    using BatchDimsIndexerT =
+        ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
+                                     Strided1DIndexer>;
+
+    const BatchDimsIndexerT batch_indexer(
+        Strided1DIndexer{/* size */ batch_nelems,
+                         /* step */ n * k},
+        Strided1DIndexer{/* size */ batch_nelems,
+                         /* step */ k * m},
+        Strided1DIndexer{/* size */ batch_nelems,
+                         /* step */ n * m});
+
+    const std::size_t min_nm = std::min(n, m);
+    const std::size_t max_nm = std::max(n, m);
+
+    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
+        return gemm_detail::_gemm_batch_nm_impl<
+            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+    }
+
+    sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.fill<resTy>(res_tp, resTy(0), n * m * batch_nelems);
+    });
+
+    if (k == 0) {
+        return res_init_ev;
+    }
+
+    if (max_nm < 64) {
+        if (m < 4) {
+            return gemm_detail::_gemm_small_m_impl<
+                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
+                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+                batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+                {res_init_ev});
+        }
+        return gemm_detail::_gemm_k_impl<
+            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+            {res_init_ev});
+    }
+
+    return gemm_detail::_gemm_batch_nm_impl<
+        lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+        OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
+        exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_indexer,
+        lhs_indexer, rhs_indexer, res_indexer, {res_init_ev});
+}
+
+// ========== Gemm Tree
+
+template <typename lhsT,
+          typename rhsT,
+          typename resT,
+          typename LocAccT1,
+          typename LocAccT2,
+          typename OuterInnerDimsIndexerT,
+          typename ResIndexerT,
+          typename BatchDimsIndexerT,
+          int wi_delta_n,
+          int wi_delta_m>
+class GemmBatchNoAtomicFunctorThreadNM
+{
+private:
+    const lhsT *lhs = nullptr;
+    const rhsT *rhs = nullptr;
+    resT *res = nullptr;
+    LocAccT1 local_A_block;
+    LocAccT2 local_B_block;
+    std::size_t n = 0;
+    std::size_t wg_delta_n = 0;
+    std::size_t k = 0;
+    std::size_t k_blocks = 0;
+    std::size_t wi_delta_k = 0;
+    std::size_t m = 0;
+    std::size_t m_blocks = 0;
+    std::size_t wg_delta_m = 0;
+    std::size_t batch_nelems;
+    BatchDimsIndexerT batch_indexer;
+    OuterInnerDimsIndexerT lhs_indexer;
+    OuterInnerDimsIndexerT rhs_indexer;
+    ResIndexerT res_indexer;
+
+public:
+    GemmBatchNoAtomicFunctorThreadNM(const lhsT *lhs_,
+                                     const rhsT *rhs_,
+                                     resT *res_,
+                                     LocAccT1 local_A_block_,
+                                     LocAccT2 local_B_block_,
+                                     std::size_t n_,
+                                     std::size_t wg_delta_n_,
+                                     std::size_t k_,
+                                     std::size_t k_blocks_,
+                                     std::size_t wi_delta_k_,
+                                     std::size_t m_,
+                                     std::size_t m_blocks_,
+                                     std::size_t wg_delta_m_,
+                                     std::size_t batch_nelems_,
+                                     const BatchDimsIndexerT batch_indexer_,
+                                     const OuterInnerDimsIndexerT lhs_indexer_,
+                                     const OuterInnerDimsIndexerT rhs_indexer_,
+                                     const ResIndexerT res_indexer_)
+        : lhs(lhs_), rhs(rhs_), res(res_), local_A_block(local_A_block_),
+          local_B_block(local_B_block_), n(n_), wg_delta_n(wg_delta_n_), k(k_),
+          k_blocks(k_blocks_), wi_delta_k(wi_delta_k_), m(m_),
+          m_blocks(m_blocks_), wg_delta_m(wg_delta_m_),
+          batch_nelems(batch_nelems_), batch_indexer(batch_indexer_),
+          lhs_indexer(lhs_indexer_), rhs_indexer(rhs_indexer_),
+          res_indexer(res_indexer_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t n_groups_per_batch =
+            it.get_group_range(0) / batch_nelems;
+        const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch;
+        const std::size_t gr_id =
+            it.get_group_linear_id() - m_id * n_groups_per_batch;
+
+        const auto &three_offsets_ = batch_indexer(static_cast<ssize_t>(m_id));
+
+        // lift group_id to (block_i, block_j, block_s),
+        //    0 <= block_i < n_blocks, 0 <= block_j < m_blocks, 0 <= block_s
+        //    < k_blocks
+
+        const auto &lhs_offset = three_offsets_.get_first_offset();
+        const auto &rhs_offset = three_offsets_.get_second_offset();
+        const auto &res_offset = three_offsets_.get_third_offset();
+
+        std::size_t block_i = gr_id / (m_blocks * k_blocks);
+        std::size_t block_r = gr_id - block_i * (m_blocks * k_blocks);
+        std::size_t block_j = block_r / k_blocks;
+        std::size_t block_s = block_r - block_j * k_blocks;
+
+        std::size_t lid = it.get_local_linear_id();
+        std::size_t local_i = lid / wg_delta_m; // 0<= local_i < wg_delta_n
+        std::size_t local_j =
+            lid - local_i * wg_delta_m; // 0<= local_j < wg_delta_m
+
+        // load A block and B blocks into SLM
+
+        std::size_t i = block_i * wi_delta_n * wg_delta_n;
+        std::size_t j = block_j * wi_delta_m * wg_delta_m;
+        std::size_t s = block_s * wi_delta_k;
+
+        const std::int64_t a_st0 = k;
+        const std::int64_t a_st1 = 1;
+
+        const std::int64_t b_st0 = m;
+        const std::int64_t b_st1 = 1;
+
+        const std::int64_t c_st0 = m;
+        const std::int64_t c_st1 = 1;
+
+        std::size_t lws = it.get_local_range(0);
+
+        for (std::size_t vid = lid; vid < local_A_block.size(); vid += lws) {
+            std::size_t v_i =
+                vid / wi_delta_k; // 0<= v_i < wg_delta_n * wi_delta_n
+            std::size_t v_s = vid - v_i * wi_delta_k; // 0<= v_s < wi_delta_k
+
+            std::size_t g_i = i + v_i;
+            std::size_t g_s = s + v_s;
+
+            local_A_block[vid] =
+                (g_i < n && g_s < k)
+                    ? static_cast<resT>(
+                          lhs[lhs_offset +
+                              lhs_indexer(g_i * a_st0 + g_s * a_st1)])
+                    : resT(0);
+        }
+
+        using slmB_t = typename LocAccT2::value_type;
+
+        for (std::size_t vid = lid; vid < local_B_block.size(); vid += lws) {
+            std::size_t v_j = vid / wi_delta_k;       // 0<= v_i < wg_delta_m
+            std::size_t v_s = vid - v_j * wi_delta_k; // 0<= v_s < wi_delta_k
+
+            std::size_t g_j = j + v_j * wi_delta_m;
+            std::size_t g_s = s + v_s;
+
+            if constexpr (wi_delta_m == 1 && std::is_same_v<slmB_t, resT>) {
+                local_B_block[vid] =
+                    (g_j < m && g_s < k)
+                        ? static_cast<resT>(
+                              rhs[rhs_offset +
+                                  rhs_indexer(g_s * b_st0 + g_j * b_st1)])
+                        : resT(0);
+            }
+            else {
+                slmB_t vec{};
+#pragma unroll
+                for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; ++lane_id)
+                {
+                    std::size_t g_j1 = g_j + lane_id;
+                    vec[lane_id] =
+                        (g_j1 < m && g_s < k)
+                            ? static_cast<resT>(
+                                  rhs[rhs_offset +
+                                      rhs_indexer(g_s * b_st0 + g_j1 * b_st1)])
+                            : resT(0);
+                }
+
+                local_B_block[vid] = vec;
+            }
+        }
+
+        it.barrier(sycl::access::fence_space::local_space);
+
+        i += local_i * wi_delta_n;
+        j += local_j * wi_delta_m;
+
+        const std::size_t a_offset = local_i * wi_delta_k * wi_delta_n;
+        const std::size_t b_offset = local_j * wi_delta_k;
+
+        static constexpr resT identity_(0);
+
+        for (std::uint8_t private_i = 0; private_i < wi_delta_n; ++private_i) {
+            const std::size_t a_pr_offset = private_i * wi_delta_k;
+
+            slmB_t local_sum(identity_);
+            for (std::size_t private_s = 0; private_s < wi_delta_k; ++private_s)
+            {
+                local_sum = local_sum +
+                            (local_A_block[a_offset + a_pr_offset + private_s] *
+                             local_B_block[b_offset + private_s]);
+            }
+
+            const std::size_t gl_i = i + private_i;
+
+            if constexpr (wi_delta_m == 1 && std::is_same_v<slmB_t, resT>) {
+                const std::size_t gl_j = j;
+                if (gl_i < n && gl_j < m) {
+                    res[res_offset + res_indexer(gl_i * c_st0 + gl_j * c_st1) +
+                        (block_s * n * m * batch_nelems)] = local_sum;
+                }
+            }
+            else {
+#pragma unroll
+                for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; ++lane_id)
+                {
+                    const std::size_t gl_j = j + lane_id;
+
+                    if (gl_i < n && gl_j < m) {
+                        res[res_offset +
+                            res_indexer(gl_i * c_st0 + gl_j * c_st1) +
+                            (block_s * n * m * batch_nelems)] =
+                            local_sum[lane_id];
+                    }
+                }
+            }
+        }
+    }
+};
+
+template <typename lhsT,
+          typename rhsT,
+          typename resT,
+          typename LocAccT,
+          typename OuterInnerDimsIndexerT,
+          typename ResIndexerT,
+          typename BatchDimsIndexerT,
+          std::size_t m_groups>
+class GemmBatchNoAtomicFunctorThreadK
+{
+private:
+    const lhsT *lhs = nullptr;
+    const rhsT *rhs = nullptr;
+    resT *res = nullptr;
+    LocAccT workspace;
+    LocAccT local_B_block;
+    std::size_t n = 0;
+    std::size_t n_blocks = 0;
+    std::size_t delta_n = 0;
+    std::size_t k = 0;
+    std::size_t k_blocks = 0;
+    std::size_t delta_k = 0;
+    std::size_t n_wi = 0;
+    std::size_t m = 0;
+    std::size_t batch_nelems = 0;
+    BatchDimsIndexerT batch_indexer;
+    OuterInnerDimsIndexerT lhs_indexer;
+    OuterInnerDimsIndexerT rhs_indexer;
+    ResIndexerT res_indexer;
+
+public:
+    GemmBatchNoAtomicFunctorThreadK(const lhsT *lhs_,
+                                    const rhsT *rhs_,
+                                    resT *res_,
+                                    LocAccT workspace_,
+                                    LocAccT local_B_block_,
+                                    std::size_t n_,
+                                    std::size_t n_blocks_,
+                                    std::size_t delta_n_,
+                                    std::size_t k_,
+                                    std::size_t k_blocks_,
+                                    std::size_t delta_k_,
+                                    std::size_t n_wi_,
+                                    std::size_t m_,
+                                    std::size_t batch_nelems_,
+                                    const BatchDimsIndexerT &batch_indexer_,
+                                    const OuterInnerDimsIndexerT &lhs_indexer_,
+                                    const OuterInnerDimsIndexerT &rhs_indexer_,
+                                    const ResIndexerT &res_indexer_)
+        : lhs(lhs_), rhs(rhs_), res(res_), workspace(workspace_),
+          local_B_block(local_B_block_), n(n_), n_blocks(n_blocks_),
+          delta_n(delta_n_), k(k_), k_blocks(k_blocks_), delta_k(delta_k_),
+          n_wi(n_wi_), m(m_), batch_nelems(batch_nelems_),
+          batch_indexer(batch_indexer_), lhs_indexer(lhs_indexer_),
+          rhs_indexer(rhs_indexer_), res_indexer(res_indexer_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t n_groups_per_batch =
+            it.get_group_range(0) / batch_nelems;
+        const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch;
+        const std::size_t gr_id =
+            it.get_group_linear_id() - m_id * n_groups_per_batch;
+        std::size_t lid = it.get_local_linear_id();
+
+        const auto &three_offsets_ = batch_indexer(static_cast<ssize_t>(m_id));
+        const auto &lhs_offset = three_offsets_.get_first_offset();
+        const auto &rhs_offset = three_offsets_.get_second_offset();
+        const auto &res_offset = three_offsets_.get_third_offset();
+
+        // lift gr_id -> (block_i, block_j, block_s)
+        //   block_i moves fastest, then block_s, then block_j
+
+        const std::size_t r_size = (n_blocks * k_blocks);
+        // 0 <= block_j < m_blocks
+        std::size_t block_j = gr_id / r_size;
+        // 0 <= block_r < n_blocks * k_blocks
+        std::size_t block_r = gr_id - block_j * r_size;
+        // 0 <= block_s < k_blocks
+        std::size_t block_s = block_r / n_blocks;
+        // 0 <= block_i < n_blocks
+        std::size_t block_i = block_r - block_s * n_blocks;
+
+        std::size_t local_i = lid / (delta_k); // 0 <= local_i < delta_n
+        std::size_t local_s =
+            lid - local_i * (delta_k); // 0 <= local_s < delta_k
+
+        std::size_t i = block_i * delta_n + local_i;
+        std::size_t j = m_groups * block_j;
+        std::size_t s = block_s * delta_k * n_wi + local_s;
+
+        using accV_t = typename LocAccT::value_type;
+
+        static constexpr resT identity_ = resT(0);
+        if (local_i == 0) {
+            for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) {
+                std::size_t sq = s + q;
+                std::size_t sqmj = sq * m + j;
+
+                if constexpr (m_groups == 1 && std::is_same_v<accV_t, resT>) {
+                    local_B_block[local_s + q] =
+                        (sq < k && j < m)
+                            ? static_cast<resT>(
+                                  rhs[rhs_offset + rhs_indexer(sqmj)])
+                            : identity_;
+                }
+                else {
+                    accV_t local_B_vec;
+#pragma unroll
+                    for (std::size_t vec_idx = 0; vec_idx < m_groups; ++vec_idx)
+                    {
+                        local_B_vec[vec_idx] =
+                            (sq < k && j + vec_idx < m)
+                                ? static_cast<resT>(
+                                      rhs[rhs_offset +
+                                          rhs_indexer(sqmj + vec_idx)])
+                                : identity_;
+                    }
+                    local_B_block[local_s + q] = local_B_vec;
+                }
+            }
+        }
+
+        it.barrier(sycl::access::fence_space::local_space);
+
+        std::size_t t_shift = block_s * delta_k * n_wi;
+        std::size_t global_s_offset = i * k + t_shift;
+
+        accV_t private_sum(identity_);
+        static constexpr accV_t vec_identity_(identity_);
+        for (std::size_t t = local_s; t < local_B_block.size(); t += delta_k) {
+            private_sum +=
+                ((i < n) && (t + t_shift < k))
+                    ? (static_cast<resT>(
+                           lhs[lhs_offset + lhs_indexer(global_s_offset + t)]) *
+                       local_B_block[t])
+                    : vec_identity_;
+        }
+
+        std::size_t workspace_i_shift = local_i * delta_k;
+        workspace[workspace_i_shift + local_s] = private_sum;
+
+        it.barrier(sycl::access::fence_space::local_space);
+
+        if (local_s == 0 && i < n) {
+            accV_t local_sum(workspace[workspace_i_shift]);
+            for (std::size_t t = 1; t < delta_k; ++t) {
+                local_sum += workspace[workspace_i_shift + t];
+            }
+
+            const std::size_t total_offset =
+                res_offset + (block_s * n * m * batch_nelems);
+
+            if constexpr (m_groups == 1 && std::is_same_v<accV_t, resT>) {
+                res[total_offset + res_indexer(i * m + j)] = local_sum;
+            }
+            else {
+                res[total_offset + res_indexer(i * m + j)] = local_sum[0];
+
+#pragma unroll
+                for (std::size_t vec_id = 1; vec_id < m_groups; ++vec_id) {
+                    if (j + vec_id < m) {
+                        res[total_offset + res_indexer(i * m + j + vec_id)] =
+                            local_sum[vec_id];
+                    }
+                }
+            }
+        }
+    }
+};
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          typename T6,
+          std::size_t>
+class gemm_batch_tree_k_krn;
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          typename T6,
+          std::size_t>
+class gemm_batch_tree_nm_krn;
+
+namespace gemm_detail
+{
+
+template <typename lhsTy,
+          typename rhsTy,
+          typename resTy,
+          typename BatchIndexerT,
+          typename LhsIndexerT,
+          typename RhsIndexerT,
+          typename ResIndexerT,
+          std::uint32_t m_groups>
+sycl::event _gemm_tree_k_step(sycl::queue &exec_q,
+                              const lhsTy *lhs_tp,
+                              const rhsTy *rhs_tp,
+                              resTy *res_tp,
+                              const std::size_t batch_nelems,
+                              const std::size_t n,
+                              const std::size_t k,
+                              const std::size_t m,
+                              const std::size_t delta_n,
+                              const std::size_t n_wi,
+                              const std::size_t delta_k,
+                              const BatchIndexerT &batch_indexer,
+                              const LhsIndexerT &lhs_indexer,
+                              const RhsIndexerT &rhs_indexer,
+                              const ResIndexerT &res_indexer,
+                              const std::vector<sycl::event> &depends)
+{
+    static_assert(std::is_same_v<LhsIndexerT, RhsIndexerT>);
+
+    sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t n_blocks = (n + delta_n - 1) / delta_n;
+        const std::size_t k_blocks =
+            (k + n_wi * delta_k - 1) / (n_wi * delta_k);
+        const std::size_t m_blocks = (m + m_groups - 1) / m_groups;
+
+        const std::size_t lws = delta_n * delta_k;
+        const std::size_t gws =
+            batch_nelems * n_blocks * m_blocks * k_blocks * lws;
+
+        auto gRange = sycl::range<1>(gws);
+        auto lRange = sycl::range<1>(lws);
+        auto ndRange = sycl::nd_range<1>(gRange, lRange);
+
+        using slmB_t =
+            typename std::conditional<m_groups == 1, resTy,
+                                      sycl::vec<resTy, m_groups>>::type;
+
+        using LocAccT = sycl::local_accessor<slmB_t, 1>;
+        LocAccT local_B_block(n_wi * delta_k, cgh);
+        LocAccT workspace(delta_n * delta_k, cgh);
+
+        using KernelName =
+            class gemm_batch_tree_k_krn<lhsTy, rhsTy, resTy, LhsIndexerT,
+                                        ResIndexerT, BatchIndexerT, m_groups>;
+
+        cgh.parallel_for<KernelName>(
+            ndRange,
+            GemmBatchNoAtomicFunctorThreadK<lhsTy, rhsTy, resTy, LocAccT,
+                                            LhsIndexerT, ResIndexerT,
+                                            BatchIndexerT, m_groups>(
+                lhs_tp, rhs_tp, res_tp, std::move(workspace),
+                std::move(local_B_block), n, n_blocks, delta_n, k, k_blocks,
+                delta_k, n_wi, m, batch_nelems, batch_indexer, lhs_indexer,
+                rhs_indexer, res_indexer));
+    });
+    return gemm_ev;
+}
+
+} // end of namespace gemm_detail
+
+template <typename lhsTy,
+          typename rhsTy,
+          typename resTy,
+          std::uint32_t m_groups>
+sycl::event
+    gemm_batch_tree_k_impl(sycl::queue &exec_q,
+                           const lhsTy *lhs_tp,
+                           const rhsTy *rhs_tp,
+                           resTy *res_tp,
+                           std::size_t batch_nelems,
+                           std::size_t n,
+                           std::size_t k,
+                           std::size_t m,
+                           int batch_nd,
+                           const ssize_t *batch_shape_strides,
+                           ssize_t lhs_batch_offset,
+                           ssize_t rhs_batch_offset,
+                           ssize_t res_batch_offset,
+                           int inner_nd,
+                           int lhs_outer_nd,
+                           const ssize_t *lhs_outer_inner_shapes_strides,
+                           int rhs_outer_nd,
+                           const ssize_t *rhs_outer_inner_shapes_strides,
+                           int res_outer_nd,
+                           const ssize_t *res_outer_shapes_strides,
+                           const ssize_t *res_shape_strides,
+                           std::vector<sycl::event> const &depends)
+{
+    std::size_t delta_k(4);
+    std::size_t n_wi(64);
+    std::size_t delta_n(32);
+
+    const sycl::device &dev = exec_q.get_device();
+    const std::size_t local_mem_size =
+        dev.get_info<sycl::info::device::local_mem_size>();
+    const std::size_t reserved_slm_size = 512;
+
+    gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
+        local_mem_size, reserved_slm_size, delta_k,
+        n_wi,   // modified by reference
+        delta_n // modified by reference
+    );
+
+    if (k <= (delta_k * n_wi)) {
+        using OuterInnerDimsIndexerT =
+            dpctl::tensor::offset_utils::StridedIndexer;
+        const OuterInnerDimsIndexerT lhs_indexer(
+            inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides);
+        const OuterInnerDimsIndexerT rhs_indexer(
+            inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides);
+        const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0,
+                                                 res_outer_shapes_strides);
+        using BatchDimsIndexerT =
+            dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
+        const BatchDimsIndexerT batch_indexer(
+            batch_nd, lhs_batch_offset, rhs_batch_offset, res_batch_offset,
+            batch_shape_strides);
+
+        return gemm_detail::_gemm_tree_k_step<
+            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
+            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, delta_n,
+            n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+            depends);
+    }
+    else {
+        using ReductionOpT =
+            typename std::conditional<std::is_same_v<resTy, bool>,
+                                      sycl::logical_or<resTy>,
+                                      sycl::plus<resTy>>::type;
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+
+        std::size_t iter_nelems = batch_nelems * n * m;
+        std::size_t reduction_nelems =
+            (k + delta_k * n_wi - 1) / (delta_k * n_wi);
+
+        // more than one work-group is needed, requires a
+        // temporary delta_k * n_wi elements processed along k,
+        // so if more to process use multiple
+        const auto &sg_sizes =
+            dev.get_info<sycl::info::device::sub_group_sizes>();
+        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+        static constexpr std::size_t preferred_reductions_per_wi = 4;
+        std::size_t reductions_per_wi(preferred_reductions_per_wi);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        // max_max_wg prevents running out of resources on CPU
+        static constexpr std::size_t max_max_wg = 2048;
+        std::size_t max_wg = std::min(
+            max_max_wg,
+            dev.get_info<sycl::info::device::max_work_group_size>() / 2);
+
+        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    iter_nelems * reduction_nelems, exec_q);
+            resTy *tmp = tmp_owner.get();
+
+            using OuterInnerDimsIndexerT =
+                dpctl::tensor::offset_utils::StridedIndexer;
+            using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            const OuterInnerDimsIndexerT lhs_indexer(
+                inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides);
+            const OuterInnerDimsIndexerT rhs_indexer(
+                inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides);
+            static constexpr TmpIndexerT res_indexer{};
+
+            using dpctl::tensor::offset_utils::Strided1DIndexer;
+            using dpctl::tensor::offset_utils::StridedIndexer;
+            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
+            using dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+            using BatchDimsIndexerT = ThreeOffsets_CombinedIndexer<
+                StridedIndexer, UnpackedStridedIndexer, Strided1DIndexer>;
+            const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset,
+                                                   batch_shape_strides);
+            const UnpackedStridedIndexer rhs_batch_indexer(
+                batch_nd, rhs_batch_offset, batch_shape_strides,
+                batch_shape_strides + 2 * batch_nd);
+            const Strided1DIndexer tmp_batch_indexer(
+                /* size   */ batch_nelems,
+                /* step   */ n * m);
+            const BatchDimsIndexerT batch_indexer(
+                lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer);
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
+                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, TmpIndexerT, m_groups>(
+                exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, delta_n,
+                n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer,
+                res_indexer, depends);
+
+            sycl::event red_ev = single_reduction_for_gemm<resTy, ReductionOpT>(
+                exec_q, tmp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, reduction_groups, wg, max_wg,
+                preferred_reductions_per_wi, reductions_per_wi,
+                batch_nd + res_outer_nd, res_batch_offset, res_shape_strides,
+                {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+        else {
+            assert(reduction_groups > 1);
+
+            const std::size_t tmp_alloc_size =
+                iter_nelems * (
+                                  /* temp */ reduction_nelems +
+                                  /* first reduction temp */ reduction_groups);
+
+            // get unique_ptr owning the temporary allocation
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    tmp_alloc_size, exec_q);
+            // get raw USM pointer
+            resTy *partially_reduced_tmp = tmp_owner.get();
+            resTy *partially_reduced_tmp2 =
+                partially_reduced_tmp + reduction_nelems * iter_nelems;
+            ;
+
+            using OuterInnerDimsIndexerT =
+                dpctl::tensor::offset_utils::StridedIndexer;
+            using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            const OuterInnerDimsIndexerT lhs_indexer(
+                inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides);
+            const OuterInnerDimsIndexerT rhs_indexer(
+                inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides);
+            static constexpr TmpIndexerT res_indexer{};
+            using dpctl::tensor::offset_utils::Strided1DIndexer;
+            using dpctl::tensor::offset_utils::StridedIndexer;
+            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
+            using BatchDimsIndexerT =
+                ThreeOffsets_CombinedIndexer<StridedIndexer, StridedIndexer,
+                                             Strided1DIndexer>;
+            const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset,
+                                                   batch_shape_strides);
+            const StridedIndexer rhs_batch_indexer(
+                batch_nd, rhs_batch_offset, batch_shape_strides + 2 * batch_nd);
+            const Strided1DIndexer tmp_batch_indexer(
+                /* size   */ batch_nelems,
+                /* step   */ n * m);
+            const BatchDimsIndexerT batch_indexer(
+                lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer);
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
+                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, TmpIndexerT, m_groups>(
+                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, batch_nelems, n,
+                k, m, delta_n, n_wi, delta_k, batch_indexer, lhs_indexer,
+                rhs_indexer, res_indexer, depends);
+
+            sycl::event red_ev = tree_reduction_for_gemm<resTy, ReductionOpT>(
+                exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp,
+                identity_val, iter_nelems, reduction_nelems, reduction_groups,
+                wg, max_wg, preferred_reductions_per_wi, reductions_per_wi,
+                batch_nd + res_outer_nd, res_batch_offset, res_shape_strides,
+                {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+    }
+}
+
+namespace gemm_detail
+{
+
+template <typename lhsTy,
+          typename rhsTy,
+          typename resTy,
+          typename BatchIndexerT,
+          typename LhsIndexerT,
+          typename RhsIndexerT,
+          typename ResIndexerT,
+          std::uint32_t wi_delta_n,
+          std::uint32_t wi_delta_m>
+sycl::event _gemm_tree_nm_step(sycl::queue &exec_q,
+                               const lhsTy *lhs_tp,
+                               const rhsTy *rhs_tp,
+                               resTy *res_tp,
+                               const std::size_t batch_nelems,
+                               const std::size_t n,
+                               const std::size_t k,
+                               const std::size_t m,
+                               const std::uint32_t wg_delta_n,
+                               const std::uint32_t wg_delta_m,
+                               const std::uint32_t wi_delta_k,
+                               const BatchIndexerT &batch_indexer,
+                               const LhsIndexerT &lhs_indexer,
+                               const RhsIndexerT &rhs_indexer,
+                               const ResIndexerT &res_indexer,
+                               const std::vector<sycl::event> &depends)
+{
+    static_assert(std::is_same_v<LhsIndexerT, RhsIndexerT>);
+
+    sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t lws = wg_delta_n * wg_delta_m;
+
+        const std::size_t n_blocks =
+            ((n + wi_delta_n * wg_delta_n - 1) / (wi_delta_n * wg_delta_n));
+        const std::size_t k_blocks = ((k + wi_delta_k - 1) / wi_delta_k);
+        const std::size_t m_blocks =
+            ((m + wi_delta_m * wg_delta_m - 1) / (wi_delta_m * wg_delta_m));
+
+        const std::size_t gws =
+            batch_nelems * n_blocks * m_blocks * k_blocks * lws;
+
+        auto gwsRange = sycl::range<1>(gws);
+        auto lwsRange = sycl::range<1>(lws);
+        auto ndRange = sycl::nd_range<1>(gwsRange, lwsRange);
+
+        using slmB_t =
+            typename std::conditional<wi_delta_m == 1, resTy,
+                                      sycl::vec<resTy, wi_delta_m>>::type;
+        using LocAccT1 = sycl::local_accessor<resTy, 1>;
+        using LocAccT2 = sycl::local_accessor<slmB_t, 1>;
+
+        const sycl::range<1> local_A_size((wi_delta_n * wg_delta_n) *
+                                          wi_delta_k);
+        const sycl::range<1> local_B_size(wi_delta_k * wg_delta_m);
+
+        LocAccT1 local_A_block(local_A_size, cgh);
+        LocAccT2 local_B_block(local_B_size, cgh);
+
+        using KernelName =
+            class gemm_batch_tree_nm_krn<lhsTy, rhsTy, resTy, LhsIndexerT,
+                                         ResIndexerT, BatchIndexerT,
+                                         wi_delta_m>;
+        cgh.parallel_for<KernelName>(
+            ndRange, GemmBatchNoAtomicFunctorThreadNM<
+                         lhsTy, rhsTy, resTy, LocAccT1, LocAccT2, LhsIndexerT,
+                         ResIndexerT, BatchIndexerT, wi_delta_n, wi_delta_m>(
+                         lhs_tp, rhs_tp, res_tp, std::move(local_A_block),
+                         std::move(local_B_block), n, wg_delta_n, k, k_blocks,
+                         wi_delta_k, m, m_blocks, wg_delta_m, batch_nelems,
+                         batch_indexer, lhs_indexer, rhs_indexer, res_indexer));
+    });
+    return gemm_ev;
+}
+
+} // end namespace gemm_detail
+
+template <typename lhsTy, typename rhsTy, typename resTy, int wi_delta_m>
+sycl::event
+    gemm_batch_tree_nm_impl(sycl::queue &exec_q,
+                            const lhsTy *lhs_tp,
+                            const rhsTy *rhs_tp,
+                            resTy *res_tp,
+                            std::size_t batch_nelems,
+                            std::size_t n,
+                            std::size_t k,
+                            std::size_t m,
+                            int batch_nd,
+                            const ssize_t *batch_shape_strides,
+                            ssize_t lhs_batch_offset,
+                            ssize_t rhs_batch_offset,
+                            ssize_t res_batch_offset,
+                            int inner_nd,
+                            int lhs_outer_nd,
+                            const ssize_t *lhs_outer_inner_shapes_strides,
+                            int rhs_outer_nd,
+                            const ssize_t *rhs_outer_inner_shapes_strides,
+                            int res_outer_nd,
+                            const ssize_t *res_outer_shapes_strides,
+                            const ssize_t *res_shape_strides,
+                            std::vector<sycl::event> const &depends)
+{
+    static constexpr int wi_delta_n = 2;
+    std::size_t wg_delta_n(16); // rows of A processed in WG
+    std::size_t wg_delta_m(16); // rows of B processed in WG
+    std::size_t wi_delta_k(64); // Elements in K dimension processed by WI
+
+    const sycl::device &dev = exec_q.get_device();
+    const std::size_t local_mem_size =
+        dev.get_info<sycl::info::device::local_mem_size>();
+    const std::size_t reserved_slm_size = 512;
+
+    gemm_detail::scale_gemm_nm_parameters<resTy, wi_delta_m>(
+        local_mem_size, reserved_slm_size, wi_delta_n,
+        wi_delta_k, // modified by reference
+        wg_delta_n, // modified by reference
+        wg_delta_m  // modified by reference
+    );
+
+    // each group processes delta_k * n_wi
+    // items in a column, so no need for allocating
+    // temp memory if only one group is needed
+    if (k <= wi_delta_k) {
+        using OuterInnerDimsIndexerT =
+            dpctl::tensor::offset_utils::StridedIndexer;
+        const OuterInnerDimsIndexerT lhs_indexer(
+            inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides);
+        const OuterInnerDimsIndexerT rhs_indexer(
+            inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides);
+        const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0,
+                                                 res_outer_shapes_strides);
+        using BatchDimsIndexerT =
+            dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
+        const BatchDimsIndexerT batch_indexer(
+            batch_nd, lhs_batch_offset, rhs_batch_offset, res_batch_offset,
+            batch_shape_strides);
+
+        return gemm_detail::_gemm_tree_nm_step<
+            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
+            wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+                        wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer,
+                        lhs_indexer, rhs_indexer, res_indexer, depends);
+    }
+    else {
+        using ReductionOpT =
+            typename std::conditional<std::is_same_v<resTy, bool>,
+                                      sycl::logical_or<resTy>,
+                                      sycl::plus<resTy>>::type;
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+        std::size_t iter_nelems = batch_nelems * n * m;
+        std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k;
+
+        // more than one work-group is needed, requires a temporary
+        // delta_k * n_wi elements processed along k, so if more to
+        // process use multiple
+        const auto &sg_sizes =
+            dev.get_info<sycl::info::device::sub_group_sizes>();
+        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+        static constexpr std::size_t preferred_reductions_per_wi = 4;
+        std::size_t reductions_per_wi(preferred_reductions_per_wi);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
+
+        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    iter_nelems * reduction_nelems, exec_q);
+            resTy *tmp = tmp_owner.get();
+
+            using OuterInnerDimsIndexerT =
+                dpctl::tensor::offset_utils::StridedIndexer;
+            using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            const OuterInnerDimsIndexerT lhs_indexer(
+                inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides);
+            const OuterInnerDimsIndexerT rhs_indexer(
+                inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides);
+            static constexpr TmpIndexerT res_indexer{};
+
+            using dpctl::tensor::offset_utils::Strided1DIndexer;
+            using dpctl::tensor::offset_utils::StridedIndexer;
+            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
+            using dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+            using BatchDimsIndexerT = ThreeOffsets_CombinedIndexer<
+                StridedIndexer, UnpackedStridedIndexer, Strided1DIndexer>;
+            const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset,
+                                                   batch_shape_strides);
+            const UnpackedStridedIndexer rhs_batch_indexer(
+                batch_nd, rhs_batch_offset, batch_shape_strides,
+                batch_shape_strides + 2 * batch_nd);
+            const Strided1DIndexer tmp_batch_indexer(
+                /* size   */ batch_nelems,
+                /* step   */ n * m);
+            const BatchDimsIndexerT batch_indexer(
+                lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer);
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
+                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, TmpIndexerT, wi_delta_n, wi_delta_m>(
+                exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, wg_delta_n,
+                wg_delta_m, wi_delta_k, batch_indexer, lhs_indexer, rhs_indexer,
+                res_indexer, depends);
+
+            sycl::event red_ev = single_reduction_for_gemm<resTy, ReductionOpT>(
+                exec_q, tmp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, reduction_groups, wg, max_wg,
+                preferred_reductions_per_wi, reductions_per_wi,
+                batch_nd + res_outer_nd, res_batch_offset, res_shape_strides,
+                {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+        else {
+            assert(reduction_groups > 1);
+
+            const std::size_t tmp_alloc_size =
+                iter_nelems * (/* temp */ reduction_nelems +
+                               /* first reduction temp */ reduction_groups);
+
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    tmp_alloc_size, exec_q);
+
+            resTy *partially_reduced_tmp = tmp_owner.get();
+            resTy *partially_reduced_tmp2 =
+                partially_reduced_tmp + reduction_nelems * iter_nelems;
+            ;
+
+            using OuterInnerDimsIndexerT =
+                dpctl::tensor::offset_utils::StridedIndexer;
+            using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+            const OuterInnerDimsIndexerT lhs_indexer(
+                inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides);
+            const OuterInnerDimsIndexerT rhs_indexer(
+                inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides);
+            static constexpr TmpIndexerT res_indexer{};
+
+            using dpctl::tensor::offset_utils::Strided1DIndexer;
+            using dpctl::tensor::offset_utils::StridedIndexer;
+            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
+            using dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+            using BatchDimsIndexerT = ThreeOffsets_CombinedIndexer<
+                StridedIndexer, UnpackedStridedIndexer, Strided1DIndexer>;
+
+            const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset,
+                                                   batch_shape_strides);
+            const UnpackedStridedIndexer rhs_batch_indexer(
+                batch_nd, rhs_batch_offset, batch_shape_strides,
+                batch_shape_strides + 2 * batch_nd);
+            const Strided1DIndexer tmp_batch_indexer(
+                /* size   */ batch_nelems,
+                /* step   */ n * m);
+            const BatchDimsIndexerT batch_indexer(
+                lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer);
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
+                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, TmpIndexerT, wi_delta_n, wi_delta_m>(
+                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, batch_nelems, n,
+                k, m, wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer,
+                lhs_indexer, rhs_indexer, res_indexer, depends);
+
+            sycl::event red_ev = tree_reduction_for_gemm<resTy, ReductionOpT>(
+                exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp,
+                identity_val, iter_nelems, reduction_nelems, reduction_groups,
+                wg, max_wg, preferred_reductions_per_wi, reductions_per_wi,
+                batch_nd + res_outer_nd, res_batch_offset, res_shape_strides,
+                {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+    }
+}
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_batch_nm_impl(sycl::queue &exec_q,
+                               const lhsTy *lhs_tp,
+                               const rhsTy *rhs_tp,
+                               resTy *res_tp,
+                               std::size_t batch_nelems,
+                               std::size_t n,
+                               std::size_t k,
+                               std::size_t m,
+                               int batch_nd,
+                               const ssize_t *batch_shape_strides,
+                               ssize_t lhs_batch_offset,
+                               ssize_t rhs_batch_offset,
+                               ssize_t res_batch_offset,
+                               int inner_nd,
+                               int lhs_outer_nd,
+                               const ssize_t *lhs_outer_inner_shapes_strides,
+                               int rhs_outer_nd,
+                               const ssize_t *rhs_outer_inner_shapes_strides,
+                               int res_outer_nd,
+                               const ssize_t *res_outer_shapes_strides,
+                               const ssize_t *res_shape_strides,
+                               std::vector<sycl::event> const &depends = {})
+{
+
+    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
+                                             lhs_outer_inner_shapes_strides);
+    const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
+                                             rhs_outer_inner_shapes_strides);
+    const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0,
+                                             res_outer_shapes_strides);
+
+    using BatchDimsIndexerT =
+        dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
+    const BatchDimsIndexerT batch_indexer(batch_nd, lhs_batch_offset,
+                                          rhs_batch_offset, res_batch_offset,
+                                          batch_shape_strides);
+
+    sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl<
+        lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+        OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
+        exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_indexer,
+        lhs_indexer, rhs_indexer, res_indexer, depends);
+
+    return gemm_ev;
+}
+
+template <typename T1, typename T2, typename T3>
+class gemm_batch_tree_empty_krn;
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_batch_tree_impl(sycl::queue &exec_q,
+                                 const char *lhs_cp,
+                                 const char *rhs_cp,
+                                 char *res_cp,
+                                 std::size_t batch_nelems,
+                                 std::size_t n,
+                                 std::size_t k,
+                                 std::size_t m,
+                                 int batch_nd,
+                                 const ssize_t *batch_shape_strides,
+                                 ssize_t lhs_batch_offset,
+                                 ssize_t rhs_batch_offset,
+                                 ssize_t res_batch_offset,
+                                 int inner_nd,
+                                 int lhs_outer_nd,
+                                 const ssize_t *lhs_outer_inner_shapes_strides,
+                                 int rhs_outer_nd,
+                                 const ssize_t *rhs_outer_inner_shapes_strides,
+                                 int res_outer_nd,
+                                 const ssize_t *res_outer_shapes_strides,
+                                 const ssize_t *res_shape_strides,
+                                 std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    const std::size_t min_nm = std::min(n, m);
+    const std::size_t max_nm = std::max(n, m);
+
+    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
+        return gemm_batch_nm_impl<lhsTy, rhsTy, resTy>(
+            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd,
+            batch_shape_strides, lhs_batch_offset, rhs_batch_offset,
+            res_batch_offset, inner_nd, lhs_outer_nd,
+            lhs_outer_inner_shapes_strides, rhs_outer_nd,
+            rhs_outer_inner_shapes_strides, res_outer_nd,
+            res_outer_shapes_strides, res_shape_strides, depends);
+    }
+
+    if (k == 0) {
+        sycl::event gemm_batch_no_reduction_ev =
+            exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(depends);
+
+                using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+                const IndexerT res_indexer(batch_nd + res_outer_nd,
+                                           res_batch_offset, res_shape_strides);
+                using InitKernelName =
+                    class gemm_batch_tree_empty_krn<lhsTy, rhsTy, resTy>;
+                cgh.parallel_for<InitKernelName>(
+                    sycl::range<1>(n * m * batch_nelems), [=](sycl::id<1> id) {
+                        auto res_offset = res_indexer(id[0]);
+                        res_tp[res_offset] = resTy(0);
+                    });
+            });
+        return gemm_batch_no_reduction_ev;
+    }
+
+    if (max_nm < 64) {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (!is_complex<resTy>::value) {
+            if (m < 4) {
+                static constexpr std::uint32_t m_groups_one = 1;
+                return gemm_batch_tree_k_impl<lhsTy, rhsTy, resTy,
+                                              m_groups_one>(
+                    exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+                    batch_nd, batch_shape_strides, lhs_batch_offset,
+                    rhs_batch_offset, res_batch_offset, inner_nd, lhs_outer_nd,
+                    lhs_outer_inner_shapes_strides, rhs_outer_nd,
+                    rhs_outer_inner_shapes_strides, res_outer_nd,
+                    res_outer_shapes_strides, res_shape_strides, depends);
+            }
+            else {
+                static constexpr std::uint32_t m_groups_four = 4;
+                return gemm_batch_tree_k_impl<lhsTy, rhsTy, resTy,
+                                              m_groups_four>(
+                    exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+                    batch_nd, batch_shape_strides, lhs_batch_offset,
+                    rhs_batch_offset, res_batch_offset, inner_nd, lhs_outer_nd,
+                    lhs_outer_inner_shapes_strides, rhs_outer_nd,
+                    rhs_outer_inner_shapes_strides, res_outer_nd,
+                    res_outer_shapes_strides, res_shape_strides, depends);
+            }
+        }
+        else {
+            static constexpr std::uint32_t m_groups_one = 1;
+            return gemm_batch_tree_k_impl<lhsTy, rhsTy, resTy, m_groups_one>(
+                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd,
+                batch_shape_strides, lhs_batch_offset, rhs_batch_offset,
+                res_batch_offset, inner_nd, lhs_outer_nd,
+                lhs_outer_inner_shapes_strides, rhs_outer_nd,
+                rhs_outer_inner_shapes_strides, res_outer_nd,
+                res_outer_shapes_strides, res_shape_strides, depends);
+        }
+    }
+    else { // m > 1, n > k or m > k
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (!is_complex<resTy>::value) {
+            static constexpr std::uint32_t m_groups_four = 4;
+            return gemm_batch_tree_nm_impl<lhsTy, rhsTy, resTy, m_groups_four>(
+                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd,
+                batch_shape_strides, lhs_batch_offset, rhs_batch_offset,
+                res_batch_offset, inner_nd, lhs_outer_nd,
+                lhs_outer_inner_shapes_strides, rhs_outer_nd,
+                rhs_outer_inner_shapes_strides, res_outer_nd,
+                res_outer_shapes_strides, res_shape_strides, depends);
+        }
+        else { // m > 1, n > k or m > k, resTy complex
+            static constexpr std::uint32_t m_groups_one = 1;
+            return gemm_batch_tree_nm_impl<lhsTy, rhsTy, resTy, m_groups_one>(
+                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd,
+                batch_shape_strides, lhs_batch_offset, rhs_batch_offset,
+                res_batch_offset, inner_nd, lhs_outer_nd,
+                lhs_outer_inner_shapes_strides, rhs_outer_nd,
+                rhs_outer_inner_shapes_strides, res_outer_nd,
+                res_outer_shapes_strides, res_shape_strides, depends);
+        }
+    }
+}
+
+template <typename lhsTy, typename rhsTy, typename resTy, std::size_t m_groups>
+sycl::event
+    gemm_batch_contig_tree_k_impl(sycl::queue &exec_q,
+                                  const lhsTy *lhs_tp,
+                                  const rhsTy *rhs_tp,
+                                  resTy *res_tp,
+                                  std::size_t batch_nelems,
+                                  std::size_t n,
+                                  std::size_t k,
+                                  std::size_t m,
+                                  std::vector<sycl::event> const &depends)
+{
+    std::size_t delta_k(4);
+    std::size_t n_wi(64);
+    std::size_t delta_n(32);
+
+    const sycl::device &dev = exec_q.get_device();
+    const std::size_t local_mem_size =
+        dev.get_info<sycl::info::device::local_mem_size>();
+    const std::size_t reserved_slm_size = 512;
+
+    gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
+        local_mem_size, reserved_slm_size, delta_k,
+        n_wi,   // modified by reference
+        delta_n // modified by reference
+    );
+
+    if (k <= (delta_k * n_wi)) {
+        using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        static constexpr OuterInnerDimsIndexerT lhs_indexer{};
+        static constexpr OuterInnerDimsIndexerT rhs_indexer{};
+        static constexpr OuterInnerDimsIndexerT res_indexer{};
+
+        using dpctl::tensor::offset_utils::Strided1DIndexer;
+        using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
+        using BatchDimsIndexerT =
+            ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
+                                         Strided1DIndexer>;
+
+        using dpctl::tensor::offset_utils::Strided1DIndexer;
+        const BatchDimsIndexerT batch_indexer(
+            Strided1DIndexer{/* size   */ batch_nelems,
+                             /* step   */ n * k},
+            Strided1DIndexer{/* size   */ batch_nelems,
+                             /* step   */ k * m},
+            Strided1DIndexer{/* size   */ batch_nelems,
+                             /* step   */ n * m});
+
+        return gemm_detail::_gemm_tree_k_step<
+            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
+            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, delta_n,
+            n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+            depends);
+    }
+    else {
+        using ReductionOpT =
+            typename std::conditional<std::is_same_v<resTy, bool>,
+                                      sycl::logical_or<resTy>,
+                                      sycl::plus<resTy>>::type;
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+
+        std::size_t iter_nelems = batch_nelems * n * m;
+        std::size_t reduction_nelems =
+            (k + delta_k * n_wi - 1) / (delta_k * n_wi);
+
+        // more than one work-group is needed, requires a
+        // temporary delta_k * n_wi elements processed along k,
+        // so if more to process use multiple
+        const auto &sg_sizes =
+            dev.get_info<sycl::info::device::sub_group_sizes>();
+        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+        static constexpr std::size_t preferred_reductions_per_wi = 4;
+        std::size_t reductions_per_wi(preferred_reductions_per_wi);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
+
+        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    iter_nelems * reduction_nelems, exec_q);
+            resTy *tmp = tmp_owner.get();
+
+            using OuterInnerDimsIndexerT =
+                dpctl::tensor::offset_utils::NoOpIndexer;
+            static constexpr OuterInnerDimsIndexerT lhs_indexer{};
+            static constexpr OuterInnerDimsIndexerT rhs_indexer{};
+            static constexpr OuterInnerDimsIndexerT tmp_indexer{};
+            using dpctl::tensor::offset_utils::Strided1DIndexer;
+            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
+            using BatchDimsIndexerT =
+                ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
+                                             Strided1DIndexer>;
+
+            const BatchDimsIndexerT batch_indexer(
+                Strided1DIndexer{/* size   */ batch_nelems,
+                                 /* step   */ n * k},
+                Strided1DIndexer{/* size   */ batch_nelems,
+                                 /* step   */ k * m},
+                Strided1DIndexer{/* size   */ batch_nelems,
+                                 /* step   */ n * m});
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
+                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
+                exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, delta_n,
+                n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer,
+                tmp_indexer, depends);
+
+            sycl::event red_ev =
+                single_reduction_for_gemm_contig<resTy, ReductionOpT>(
+                    exec_q, tmp, res_tp, identity_val, iter_nelems,
+                    reduction_nelems, reduction_groups, wg, max_wg,
+                    preferred_reductions_per_wi, reductions_per_wi, {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+        else {
+            assert(reduction_groups > 1);
+
+            const std::size_t tmp_alloc_size =
+                iter_nelems * (/* temp */ reduction_nelems +
+                               /* first reduction temp */ reduction_groups);
+
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    tmp_alloc_size, exec_q);
+
+            resTy *partially_reduced_tmp = tmp_owner.get();
+            resTy *partially_reduced_tmp2 =
+                partially_reduced_tmp + reduction_nelems * iter_nelems;
+
+            using OuterInnerDimsIndexerT =
+                dpctl::tensor::offset_utils::NoOpIndexer;
+            static constexpr OuterInnerDimsIndexerT lhs_indexer{};
+            static constexpr OuterInnerDimsIndexerT rhs_indexer{};
+            static constexpr OuterInnerDimsIndexerT tmp_indexer{};
+            using dpctl::tensor::offset_utils::Strided1DIndexer;
+            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
+            using BatchDimsIndexerT =
+                ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
+                                             Strided1DIndexer>;
+
+            const BatchDimsIndexerT batch_indexer(
+                Strided1DIndexer{/* size   */ batch_nelems,
+                                 /* step   */ n * k},
+                Strided1DIndexer{/* size   */ batch_nelems,
+                                 /* step   */ k * m},
+                Strided1DIndexer{/* size   */ batch_nelems,
+                                 /* step   */ n * m});
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
+                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
+                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, batch_nelems, n,
+                k, m, delta_n, n_wi, delta_k, batch_indexer, lhs_indexer,
+                rhs_indexer, tmp_indexer, depends);
+
+            sycl::event red_ev =
+                tree_reduction_for_gemm_contig<resTy, ReductionOpT>(
+                    exec_q, partially_reduced_tmp, partially_reduced_tmp2,
+                    res_tp, identity_val, iter_nelems, reduction_nelems,
+                    reduction_groups, wg, max_wg, preferred_reductions_per_wi,
+                    reductions_per_wi, {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+    }
+}
+
+template <typename lhsTy, typename rhsTy, typename resTy, int wi_delta_m>
+sycl::event
+    gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q,
+                                   const lhsTy *lhs_tp,
+                                   const rhsTy *rhs_tp,
+                                   resTy *res_tp,
+                                   std::size_t batch_nelems,
+                                   std::size_t n,
+                                   std::size_t k,
+                                   std::size_t m,
+                                   std::vector<sycl::event> const &depends)
+{
+    static constexpr int wi_delta_n = 2;
+    std::size_t wg_delta_n(16); // rows of A processed in WG
+    std::size_t wg_delta_m(16); // rows of B processed in WG
+    std::size_t wi_delta_k(64); // Elements in K dimension processed by WI
+
+    const sycl::device &dev = exec_q.get_device();
+    const std::size_t local_mem_size =
+        dev.get_info<sycl::info::device::local_mem_size>();
+    const std::size_t reserved_slm_size = 512;
+
+    gemm_detail::scale_gemm_nm_parameters<resTy, wi_delta_m>(
+        local_mem_size, reserved_slm_size, wi_delta_n,
+        wi_delta_k, // modified by reference
+        wg_delta_n, // modified by reference
+        wg_delta_m  // modified by reference
+    );
+
+    // each group processes delta_k * n_wi
+    // items in a column, so no need for allocating
+    // temp memory if only one group is needed
+    if (k <= wi_delta_k) {
+        using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        static constexpr OuterInnerDimsIndexerT lhs_indexer{};
+        static constexpr OuterInnerDimsIndexerT rhs_indexer{};
+        static constexpr OuterInnerDimsIndexerT res_indexer{};
+
+        using dpctl::tensor::offset_utils::Strided1DIndexer;
+        using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
+        using BatchDimsIndexerT =
+            ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
+                                         Strided1DIndexer>;
+
+        const BatchDimsIndexerT batch_indexer(
+            Strided1DIndexer{/* size   */ batch_nelems,
+                             /* step   */ n * k},
+            Strided1DIndexer{/* size   */ batch_nelems,
+                             /* step   */ k * m},
+            Strided1DIndexer{/* size   */ batch_nelems,
+                             /* step   */ n * m});
+
+        return gemm_detail::_gemm_tree_nm_step<
+            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
+            wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+                        wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer,
+                        lhs_indexer, rhs_indexer, res_indexer, depends);
+    }
+    else {
+        using ReductionOpT =
+            typename std::conditional<std::is_same_v<resTy, bool>,
+                                      sycl::logical_or<resTy>,
+                                      sycl::plus<resTy>>::type;
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+        std::size_t iter_nelems = batch_nelems * n * m;
+        std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k;
+
+        // more than one work-group is needed, requires a temporary
+        // delta_k * n_wi elements processed along k, so if more to
+        // process use multiple
+        const auto &sg_sizes =
+            dev.get_info<sycl::info::device::sub_group_sizes>();
+        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+        static constexpr std::size_t preferred_reductions_per_wi = 4;
+        std::size_t reductions_per_wi(preferred_reductions_per_wi);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
+
+        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    iter_nelems * reduction_nelems, exec_q);
+
+            resTy *tmp = tmp_owner.get();
+
+            using OuterInnerDimsIndexerT =
+                dpctl::tensor::offset_utils::NoOpIndexer;
+            static constexpr OuterInnerDimsIndexerT lhs_indexer{};
+            static constexpr OuterInnerDimsIndexerT rhs_indexer{};
+            static constexpr OuterInnerDimsIndexerT tmp_indexer{};
+
+            using dpctl::tensor::offset_utils::Strided1DIndexer;
+            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
+            using BatchDimsIndexerT =
+                ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
+                                             Strided1DIndexer>;
+
+            const BatchDimsIndexerT batch_indexer(
+                Strided1DIndexer{/* size */ batch_nelems,
+                                 /* step */ n * k},
+                Strided1DIndexer{/* size   */ batch_nelems,
+                                 /* step   */ k * m},
+                Strided1DIndexer{/* size   */ batch_nelems,
+                                 /* step   */ n * m});
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
+                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
+                wi_delta_m>(exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m,
+                            wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer,
+                            lhs_indexer, rhs_indexer, tmp_indexer, depends);
+
+            sycl::event red_ev =
+                single_reduction_for_gemm_contig<resTy, ReductionOpT>(
+                    exec_q, tmp, res_tp, identity_val, iter_nelems,
+                    reduction_nelems, reduction_groups, wg, max_wg,
+                    preferred_reductions_per_wi, reductions_per_wi, {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+        else {
+            assert(reduction_groups > 1);
+
+            const std::size_t tmp_alloc_size =
+                iter_nelems * (/* temp */ reduction_nelems +
+                               /* first reduction temp */ reduction_groups);
+
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    tmp_alloc_size, exec_q);
+
+            resTy *partially_reduced_tmp = tmp_owner.get();
+            resTy *partially_reduced_tmp2 =
+                partially_reduced_tmp + reduction_nelems * iter_nelems;
+
+            using OuterInnerDimsIndexerT =
+                dpctl::tensor::offset_utils::NoOpIndexer;
+            static constexpr OuterInnerDimsIndexerT lhs_indexer{};
+            static constexpr OuterInnerDimsIndexerT rhs_indexer{};
+            static constexpr OuterInnerDimsIndexerT tmp_indexer{};
+
+            using dpctl::tensor::offset_utils::Strided1DIndexer;
+            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
+            using BatchDimsIndexerT =
+                ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
+                                             Strided1DIndexer>;
+
+            const BatchDimsIndexerT batch_indexer(
+                Strided1DIndexer{/* size */ batch_nelems,
+                                 /* step */ n * k},
+                Strided1DIndexer{/* size   */ batch_nelems,
+                                 /* step   */ k * m},
+                Strided1DIndexer{/* size   */ batch_nelems,
+                                 /* step   */ n * m});
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
+                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
+                wi_delta_m>(exec_q, lhs_tp, rhs_tp, partially_reduced_tmp,
+                            batch_nelems, n, k, m, wg_delta_n, wg_delta_m,
+                            wi_delta_k, batch_indexer, lhs_indexer, rhs_indexer,
+                            tmp_indexer, depends);
+
+            sycl::event red_ev =
+                tree_reduction_for_gemm_contig<resTy, ReductionOpT>(
+                    exec_q, partially_reduced_tmp, partially_reduced_tmp2,
+                    res_tp, identity_val, iter_nelems, reduction_nelems,
+                    reduction_groups, wg, max_wg, preferred_reductions_per_wi,
+                    reductions_per_wi, {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+    }
+}
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_nm_impl(sycl::queue &exec_q,
+                         const lhsTy *lhs_tp,
+                         const rhsTy *rhs_tp,
+                         resTy *res_tp,
+                         std::size_t n,
+                         std::size_t k,
+                         std::size_t m,
+                         int inner_nd,
+                         int lhs_outer_nd,
+                         const ssize_t *lhs_shape_strides,
+                         int rhs_outer_nd,
+                         const ssize_t *rhs_shape_strides,
+                         int res_outer_nd,
+                         const ssize_t *res_shape_strides,
+                         std::vector<sycl::event> const &depends = {})
+{
+    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
+                                             lhs_shape_strides);
+    const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
+                                             rhs_shape_strides);
+    const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0,
+                                             res_shape_strides);
+
+    using BatchDimsIndexerT =
+        dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
+    static constexpr BatchDimsIndexerT batch_indexer{};
+
+    static constexpr std::size_t single_batch_nelems = 1;
+
+    sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl<
+        lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+        OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
+        exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+        batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+
+    return gemm_ev;
+}
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event
+    gemm_batch_nm_contig_impl(sycl::queue &exec_q,
+                              const lhsTy *lhs_tp,
+                              const rhsTy *rhs_tp,
+                              resTy *res_tp,
+                              std::size_t batch_nelems,
+                              std::size_t n,
+                              std::size_t k,
+                              std::size_t m,
+                              std::vector<sycl::event> const &depends = {})
+{
+    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+    static constexpr OuterInnerDimsIndexerT lhs_indexer{};
+    static constexpr OuterInnerDimsIndexerT rhs_indexer{};
+    static constexpr OuterInnerDimsIndexerT res_indexer{};
+
+    static constexpr std::size_t single_batch_nelems = 1;
+    if (batch_nelems == single_batch_nelems) {
+        using BatchDimsIndexerT =
+            dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
+        static constexpr BatchDimsIndexerT batch_indexer{};
+
+        sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl<
+            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+
+        return gemm_ev;
+    }
+    else {
+        using dpctl::tensor::offset_utils::Strided1DIndexer;
+        using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
+        using BatchDimsIndexerT =
+            ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
+                                         Strided1DIndexer>;
+
+        using dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        const BatchDimsIndexerT batch_indexer(
+            Strided1DIndexer{/* size   */ batch_nelems,
+                             /* step   */ n * k},
+            Strided1DIndexer{/* size   */ batch_nelems,
+                             /* step   */ k * m},
+            Strided1DIndexer{/* size   */ batch_nelems,
+                             /* step   */ n * m});
+
+        sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl<
+            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+
+        return gemm_ev;
+    }
+}
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event
+    gemm_batch_contig_tree_impl(sycl::queue &exec_q,
+                                const char *lhs_cp,
+                                const char *rhs_cp,
+                                char *res_cp,
+                                std::size_t batch_nelems,
+                                std::size_t n,
+                                std::size_t k,
+                                std::size_t m,
+                                ssize_t lhs_batch_offset,
+                                ssize_t rhs_batch_offset,
+                                ssize_t res_batch_offset,
+                                std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy *lhs_tp =
+        reinterpret_cast<const lhsTy *>(lhs_cp) + lhs_batch_offset;
+    const rhsTy *rhs_tp =
+        reinterpret_cast<const rhsTy *>(rhs_cp) + rhs_batch_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + res_batch_offset;
+
+    const std::size_t min_nm = std::min(n, m);
+    const std::size_t max_nm = std::max(n, m);
+
+    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
+        return gemm_batch_nm_contig_impl<lhsTy, rhsTy, resTy>(
+            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends);
+    }
+
+    if (k == 0) {
+        sycl::event gemm_batch_no_reduction_ev =
+            exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(depends);
+                cgh.fill<resTy>(res_tp, resTy(0), n * m * batch_nelems);
+            });
+        return gemm_batch_no_reduction_ev;
+    }
+
+    if (max_nm < 64) {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (!is_complex<resTy>::value) {
+            if (m < 4) {
+                return gemm_batch_contig_tree_k_impl<lhsTy, rhsTy, resTy, 1>(
+                    exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+                    depends);
+            }
+            else {
+                return gemm_batch_contig_tree_k_impl<lhsTy, rhsTy, resTy, 4>(
+                    exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
+                    depends);
+            }
+        }
+        else {
+            return gemm_batch_contig_tree_k_impl<lhsTy, rhsTy, resTy, 1>(
+                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends);
+        }
+    }
+    else { // m > 1, n > k or m > k
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (!is_complex<resTy>::value) {
+            return gemm_batch_contig_tree_nm_impl<lhsTy, rhsTy, resTy, 4>(
+                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends);
+        }
+        else { // m > 1, n > k or m > k, resTy complex
+            return gemm_batch_contig_tree_nm_impl<lhsTy, rhsTy, resTy, 1>(
+                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends);
+        }
+    }
+}
+
+// Gemm tree non-batched
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          std::size_t>
+class gemm_tree_nm_krn;
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          std::size_t>
+class gemm_tree_k_krn;
+
+template <typename lhsTy, typename rhsTy, typename resTy, std::size_t m_groups>
+sycl::event gemm_tree_k_impl(sycl::queue &exec_q,
+                             const lhsTy *lhs_tp,
+                             const rhsTy *rhs_tp,
+                             resTy *res_tp,
+                             std::size_t n,
+                             std::size_t k,
+                             std::size_t m,
+                             int inner_nd,
+                             int lhs_outer_nd,
+                             const ssize_t *lhs_outer_inner_shapes_strides,
+                             int rhs_outer_nd,
+                             const ssize_t *rhs_outer_inner_shapes_strides,
+                             int res_nd,
+                             const ssize_t *res_shapes_strides,
+                             const std::vector<sycl::event> &depends)
+{
+    std::size_t delta_k(4);
+    std::size_t n_wi(64);
+    std::size_t delta_n(32);
+
+    const sycl::device &dev = exec_q.get_device();
+    const std::size_t local_mem_size =
+        dev.get_info<sycl::info::device::local_mem_size>();
+    const std::size_t reserved_slm_size = 512;
+
+    gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
+        local_mem_size, reserved_slm_size, delta_k,
+        n_wi,   // modified by reference
+        delta_n // modified by reference
+    );
+
+    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
+    static constexpr BatchIndexerT batch_indexer{};
+
+    static constexpr std::size_t single_batch_nelems = 1;
+
+    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
+                                             lhs_outer_inner_shapes_strides);
+    const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
+                                             rhs_outer_inner_shapes_strides);
+
+    sycl::event gemm_ev;
+    if (k <= (delta_k * n_wi)) {
+        const OuterInnerDimsIndexerT res_indexer(res_nd, 0, res_shapes_strides);
+
+        return gemm_detail::_gemm_tree_k_step<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer,
+            res_indexer, depends);
+    }
+    else {
+        using ReductionOpT =
+            typename std::conditional<std::is_same_v<resTy, bool>,
+                                      sycl::logical_or<resTy>,
+                                      sycl::plus<resTy>>::type;
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+
+        std::size_t iter_nelems = n * m;
+        std::size_t reduction_nelems =
+            (k + delta_k * n_wi - 1) / (delta_k * n_wi);
+
+        // more than one work-groups is needed, requires a temporary
+        // delta_k * n_wi elements processed along k, so if more to
+        // process use multiple
+        const auto &sg_sizes =
+            dev.get_info<sycl::info::device::sub_group_sizes>();
+        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+        static constexpr std::size_t preferred_reductions_per_wi = 8;
+        std::size_t reductions_per_wi(preferred_reductions_per_wi);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
+
+        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    iter_nelems * reduction_nelems, exec_q);
+            resTy *tmp = tmp_owner.get();
+
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            static constexpr ResIndexerT res_indexer{};
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, ResIndexerT, m_groups>(
+                exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, k, m,
+                delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer,
+                res_indexer, depends);
+
+            sycl::event red_ev = single_reduction_for_gemm<resTy, ReductionOpT>(
+                exec_q, tmp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, reduction_groups, wg, max_wg,
+                preferred_reductions_per_wi, reductions_per_wi, res_nd, 0,
+                res_shapes_strides, {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+            return cleanup_host_task_event;
+        }
+        else {
+            assert(reduction_groups > 1);
+
+            const std::size_t tmp_alloc_size =
+                iter_nelems * (/* temp */ reduction_nelems +
+                               /* first reduction temp */ reduction_groups);
+
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    tmp_alloc_size, exec_q);
+
+            resTy *partially_reduced_tmp = tmp_owner.get();
+            resTy *partially_reduced_tmp2 =
+                partially_reduced_tmp + reduction_nelems * iter_nelems;
+
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            static constexpr ResIndexerT res_indexer{};
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, ResIndexerT, m_groups>(
+                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp,
+                single_batch_nelems, n, k, m, delta_n, n_wi, delta_k,
+                batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+
+            // tree_reduction_for_gemm returns sycl::event for reduction
+            sycl::event red_ev = tree_reduction_for_gemm<resTy, ReductionOpT>(
+                exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp,
+                identity_val, iter_nelems, reduction_nelems, reduction_groups,
+                wg, max_wg, preferred_reductions_per_wi, reductions_per_wi,
+                res_nd, 0, res_shapes_strides, {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+    }
+}
+
+template <typename lhsTy, typename rhsTy, typename resTy, int wi_delta_m>
+sycl::event gemm_tree_nm_impl(sycl::queue &exec_q,
+                              const lhsTy *lhs_tp,
+                              const rhsTy *rhs_tp,
+                              resTy *res_tp,
+                              std::size_t n,
+                              std::size_t k,
+                              std::size_t m,
+                              int inner_nd,
+                              int lhs_outer_nd,
+                              const ssize_t *lhs_outer_inner_shapes_strides,
+                              int rhs_outer_nd,
+                              const ssize_t *rhs_outer_inner_shapes_strides,
+                              int res_nd,
+                              const ssize_t *res_shapes_strides,
+                              const std::vector<sycl::event> &depends)
+{
+    static constexpr int wi_delta_n = 2;
+    std::size_t wg_delta_n(16); // rows of A processed in WG
+    std::size_t wg_delta_m(16); // rows of B processed in WG
+    std::size_t wi_delta_k(64); // Elements in K dimension processed by WI
+
+    const sycl::device &dev = exec_q.get_device();
+    const std::size_t local_mem_size =
+        dev.get_info<sycl::info::device::local_mem_size>();
+    const std::size_t reserved_slm_size = 512;
+
+    gemm_detail::scale_gemm_nm_parameters<resTy, wi_delta_m>(
+        local_mem_size, reserved_slm_size, wi_delta_n,
+        wi_delta_k, // modified by reference
+        wg_delta_n, // modified by reference
+        wg_delta_m  // modified by reference
+    );
+
+    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
+    static constexpr BatchIndexerT batch_indexer{};
+
+    static constexpr std::size_t single_batch_nelems = 1;
+
+    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
+                                             lhs_outer_inner_shapes_strides);
+    const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
+                                             rhs_outer_inner_shapes_strides);
+
+    // each group processes delta_k items in a column,
+    // so no need to allocate temp memory if one group needed
+    if (k <= wi_delta_k) {
+        const OuterInnerDimsIndexerT res_indexer(res_nd, 0, res_shapes_strides);
+
+        return gemm_detail::_gemm_tree_nm_step<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
+            wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n,
+                        k, m, wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer,
+                        lhs_indexer, rhs_indexer, res_indexer, depends);
+    }
+    else {
+        using ReductionOpT =
+            typename std::conditional<std::is_same_v<resTy, bool>,
+                                      sycl::logical_or<resTy>,
+                                      sycl::plus<resTy>>::type;
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+
+        std::size_t iter_nelems = n * m;
+        std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k;
+
+        // more than one work-groups is needed, requires a temporary
+        // wi_delta_k elements processed along k, so if more to
+        // process use multiple
+        const auto &sg_sizes =
+            dev.get_info<sycl::info::device::sub_group_sizes>();
+        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+        static constexpr std::size_t preferred_reductions_per_wi = 8;
+        std::size_t reductions_per_wi(preferred_reductions_per_wi);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
+
+        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    iter_nelems * reduction_nelems, exec_q);
+            resTy *tmp = tmp_owner.get();
+
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            static constexpr ResIndexerT res_indexer{};
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, ResIndexerT, wi_delta_n, wi_delta_m>(
+                exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, k, m,
+                wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, lhs_indexer,
+                rhs_indexer, res_indexer, depends);
+
+            sycl::event red_ev = single_reduction_for_gemm<resTy, ReductionOpT>(
+                exec_q, tmp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, reduction_groups, wg, max_wg,
+                preferred_reductions_per_wi, reductions_per_wi, res_nd, 0,
+                res_shapes_strides, {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+        else {
+            assert(reduction_groups > 1);
+
+            const std::size_t tmp_alloc_size =
+                iter_nelems * (/* temp */ reduction_nelems +
+                               /* first reduction temp */ reduction_groups);
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    tmp_alloc_size, exec_q);
+
+            resTy *partially_reduced_tmp = tmp_owner.get();
+            resTy *partially_reduced_tmp2 =
+                partially_reduced_tmp + reduction_nelems * iter_nelems;
+
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            static constexpr ResIndexerT res_indexer{};
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, ResIndexerT, wi_delta_n, wi_delta_m>(
+                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp,
+                single_batch_nelems, n, k, m, wg_delta_n, wg_delta_m,
+                wi_delta_k, batch_indexer, lhs_indexer, rhs_indexer,
+                res_indexer, depends);
+
+            sycl::event red_ev = tree_reduction_for_gemm<resTy, ReductionOpT>(
+                exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp,
+                identity_val, iter_nelems, reduction_nelems, reduction_groups,
+                wg, max_wg, preferred_reductions_per_wi, reductions_per_wi,
+                res_nd, 0, res_shapes_strides, {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+    }
+}
+
+template <typename T1, typename T2, typename T3>
+class gemm_tree_empty_krn;
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_tree_impl(sycl::queue &exec_q,
+                           const char *lhs_cp,
+                           const char *rhs_cp,
+                           char *res_cp,
+                           std::size_t n,
+                           std::size_t k,
+                           std::size_t m,
+                           int inner_nd,
+                           int lhs_outer_nd,
+                           const ssize_t *lhs_outer_inner_shapes_strides,
+                           int rhs_outer_nd,
+                           const ssize_t *rhs_outer_inner_shapes_strides,
+                           int res_nd,
+                           const ssize_t *res_shapes_strides,
+                           std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    const std::size_t min_nm = std::min(n, m);
+    const std::size_t max_nm = std::max(n, m);
+
+    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
+        return gemm_nm_impl<lhsTy, rhsTy, resTy>(
+            exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd,
+            lhs_outer_inner_shapes_strides, rhs_outer_nd,
+            rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides,
+            depends);
+    }
+
+    if (k == 0) {
+        sycl::event gemm_no_reduction_ev =
+            exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(depends);
+
+                using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+                const IndexerT res_indexer(res_nd, 0, res_shapes_strides);
+                using InitKernelName =
+                    class gemm_tree_empty_krn<lhsTy, rhsTy, resTy>;
+                cgh.parallel_for<InitKernelName>(
+                    sycl::range<1>(n * m), [=](sycl::id<1> id) {
+                        auto res_offset = res_indexer(id[0]);
+                        res_tp[res_offset] = resTy(0);
+                    });
+            });
+        return gemm_no_reduction_ev;
+    }
+
+    if (max_nm < 64) {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (!is_complex<resTy>::value) {
+            if (m < 4) {
+                return gemm_tree_k_impl<lhsTy, rhsTy, resTy, 1>(
+                    exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd,
+                    lhs_outer_nd, lhs_outer_inner_shapes_strides, rhs_outer_nd,
+                    rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides,
+                    depends);
+            }
+            else {
+                return gemm_tree_k_impl<lhsTy, rhsTy, resTy, 4>(
+                    exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd,
+                    lhs_outer_nd, lhs_outer_inner_shapes_strides, rhs_outer_nd,
+                    rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides,
+                    depends);
+            }
+        }
+        else {
+            return gemm_tree_k_impl<lhsTy, rhsTy, resTy, 1>(
+                exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd,
+                lhs_outer_inner_shapes_strides, rhs_outer_nd,
+                rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides,
+                depends);
+        }
+    }
+    else { // m > 1, n > k or m > k
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (!is_complex<resTy>::value) {
+            return gemm_tree_nm_impl<lhsTy, rhsTy, resTy, 4>(
+                exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd,
+                lhs_outer_inner_shapes_strides, rhs_outer_nd,
+                rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides,
+                depends);
+        }
+        else {
+            return gemm_tree_nm_impl<lhsTy, rhsTy, resTy, 1>(
+                exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd,
+                lhs_outer_inner_shapes_strides, rhs_outer_nd,
+                rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides,
+                depends);
+        }
+    }
+}
+
+template <typename lhsTy, typename rhsTy, typename resTy, std::size_t m_groups>
+sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q,
+                                    const lhsTy *lhs_tp,
+                                    const rhsTy *rhs_tp,
+                                    resTy *res_tp,
+                                    std::size_t n,
+                                    std::size_t k,
+                                    std::size_t m,
+                                    std::vector<sycl::event> const &depends)
+{
+    std::size_t delta_k(4);
+    std::size_t n_wi(64);
+    std::size_t delta_n(32);
+
+    const sycl::device &dev = exec_q.get_device();
+    const std::size_t local_mem_size =
+        dev.get_info<sycl::info::device::local_mem_size>();
+    const std::size_t reserved_slm_size = 512;
+
+    gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
+        local_mem_size, reserved_slm_size, delta_k,
+        n_wi,   // modified by reference
+        delta_n // modified by reference
+    );
+
+    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+    static constexpr OuterInnerDimsIndexerT lhs_indexer{};
+    static constexpr OuterInnerDimsIndexerT rhs_indexer{};
+    static constexpr OuterInnerDimsIndexerT res_indexer{};
+
+    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
+    static constexpr BatchIndexerT batch_indexer{};
+
+    static constexpr std::size_t single_batch_nelems = 1;
+
+    sycl::event gemm_ev;
+    if (k <= (delta_k * n_wi)) {
+        return gemm_detail::_gemm_tree_k_step<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer,
+            res_indexer, depends);
+    }
+    else {
+        using ReductionOpT =
+            typename std::conditional<std::is_same_v<resTy, bool>,
+                                      sycl::logical_or<resTy>,
+                                      sycl::plus<resTy>>::type;
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+
+        std::size_t iter_nelems = n * m;
+        std::size_t reduction_nelems =
+            (k + delta_k * n_wi - 1) / (delta_k * n_wi);
+
+        // more than one work-groups is needed, requires a
+        // temporary delta_k * n_wi elements processed along k,
+        // so if more to process use multiple
+        const auto &sg_sizes =
+            dev.get_info<sycl::info::device::sub_group_sizes>();
+        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+        static constexpr std::size_t preferred_reductions_per_wi = 8;
+        std::size_t reductions_per_wi(preferred_reductions_per_wi);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
+
+        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    iter_nelems * reduction_nelems, exec_q);
+            resTy *tmp = tmp_owner.get();
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
+                exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, k, m,
+                delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer,
+                res_indexer, depends);
+
+            sycl::event red_ev =
+                single_reduction_for_gemm_contig<resTy, ReductionOpT>(
+                    exec_q, tmp, res_tp, identity_val, iter_nelems,
+                    reduction_nelems, reduction_groups, wg, max_wg,
+                    preferred_reductions_per_wi, reductions_per_wi, {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+            return cleanup_host_task_event;
+        }
+        else {
+            assert(reduction_groups > 1);
+
+            const std::size_t tmp_alloc_size =
+                iter_nelems * (/* temp */ reduction_nelems +
+                               /* first reduction temp */ reduction_groups);
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    tmp_alloc_size, exec_q);
+
+            resTy *partially_reduced_tmp = tmp_owner.get();
+            resTy *partially_reduced_tmp2 =
+                partially_reduced_tmp + reduction_nelems * iter_nelems;
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
+                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp,
+                single_batch_nelems, n, k, m, delta_n, n_wi, delta_k,
+                batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+
+            // tree_reduction_for_gemm_contig returns sycl::event
+            // for reduction
+            sycl::event red_ev =
+                tree_reduction_for_gemm_contig<resTy, ReductionOpT>(
+                    exec_q, partially_reduced_tmp, partially_reduced_tmp2,
+                    res_tp, identity_val, iter_nelems, reduction_nelems,
+                    reduction_groups, wg, max_wg, preferred_reductions_per_wi,
+                    reductions_per_wi, {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+    }
+}
+
+template <typename lhsTy, typename rhsTy, typename resTy, int wi_delta_m>
+sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q,
+                                     const lhsTy *lhs_tp,
+                                     const rhsTy *rhs_tp,
+                                     resTy *res_tp,
+                                     std::size_t n,
+                                     std::size_t k,
+                                     std::size_t m,
+                                     std::vector<sycl::event> const &depends)
+{
+    static constexpr int wi_delta_n = 2;
+    std::size_t wg_delta_n(16); // rows of A processed in WG
+    std::size_t wg_delta_m(16); // rows of B processed in WG
+    std::size_t wi_delta_k(64); // Elements in K dimension processed by WI
+
+    const sycl::device &dev = exec_q.get_device();
+    const std::size_t local_mem_size =
+        dev.get_info<sycl::info::device::local_mem_size>();
+    const std::size_t reserved_slm_size = 512;
+
+    gemm_detail::scale_gemm_nm_parameters<resTy, wi_delta_m>(
+        local_mem_size, reserved_slm_size, wi_delta_n,
+        wi_delta_k, // modified by reference
+        wg_delta_n, // modified by reference
+        wg_delta_m  // modified by reference
+    );
+
+    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+    static constexpr OuterInnerDimsIndexerT lhs_indexer{};
+    static constexpr OuterInnerDimsIndexerT rhs_indexer{};
+    static constexpr OuterInnerDimsIndexerT res_indexer{};
+
+    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
+    static constexpr BatchIndexerT batch_indexer{};
+
+    static constexpr std::size_t single_batch_nelems = 1;
+
+    // each group processes delta_k items in a column,
+    // so no need to allocate temp memory if one group needed
+    if (k <= wi_delta_k) {
+
+        return gemm_detail::_gemm_tree_nm_step<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
+            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
+            wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n,
+                        k, m, wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer,
+                        lhs_indexer, rhs_indexer, res_indexer, depends);
+    }
+    else {
+        using ReductionOpT =
+            typename std::conditional<std::is_same_v<resTy, bool>,
+                                      sycl::logical_or<resTy>,
+                                      sycl::plus<resTy>>::type;
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+
+        std::size_t iter_nelems = n * m;
+        std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k;
+
+        // more than one work-groups is needed, requires a temporary
+        // wi_delta_k elements processed along k, so if more to
+        // process use multiple
+        const auto &sg_sizes =
+            dev.get_info<sycl::info::device::sub_group_sizes>();
+        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+        static constexpr std::size_t preferred_reductions_per_wi = 8;
+        std::size_t reductions_per_wi(preferred_reductions_per_wi);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
+
+        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    iter_nelems * reduction_nelems, exec_q);
+            resTy *tmp = tmp_owner.get();
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
+                wi_delta_m>(exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n,
+                            k, m, wg_delta_n, wg_delta_m, wi_delta_k,
+                            batch_indexer, lhs_indexer, rhs_indexer,
+                            res_indexer, depends);
+
+            sycl::event red_ev =
+                single_reduction_for_gemm_contig<resTy, ReductionOpT>(
+                    exec_q, tmp, res_tp, identity_val, iter_nelems,
+                    reduction_nelems, reduction_groups, wg, max_wg,
+                    preferred_reductions_per_wi, reductions_per_wi, {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+            return cleanup_host_task_event;
+        }
+        else {
+            assert(reduction_groups > 1);
+
+            const std::size_t tmp_alloc_size =
+                iter_nelems * (/* temp */ reduction_nelems +
+                               /* first reduction temp */ reduction_groups);
+
+            auto tmp_owner =
+                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                    tmp_alloc_size, exec_q);
+            resTy *partially_reduced_tmp = tmp_owner.get();
+            resTy *partially_reduced_tmp2 =
+                partially_reduced_tmp + reduction_nelems * iter_nelems;
+
+            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
+                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
+                wi_delta_m>(exec_q, lhs_tp, rhs_tp, partially_reduced_tmp,
+                            single_batch_nelems, n, k, m, wg_delta_n,
+                            wg_delta_m, wi_delta_k, batch_indexer, lhs_indexer,
+                            rhs_indexer, res_indexer, depends);
+
+            sycl::event red_ev =
+                tree_reduction_for_gemm_contig<resTy, ReductionOpT>(
+                    exec_q, partially_reduced_tmp, partially_reduced_tmp2,
+                    res_tp, identity_val, iter_nelems, reduction_nelems,
+                    reduction_groups, wg, max_wg, preferred_reductions_per_wi,
+                    reductions_per_wi, {gemm_ev});
+
+            sycl::event cleanup_host_task_event =
+                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
+                                                             tmp_owner);
+
+            return cleanup_host_task_event;
+        }
+    }
+}
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_contig_tree_impl(sycl::queue &exec_q,
+                                  const char *lhs_cp,
+                                  const char *rhs_cp,
+                                  char *res_cp,
+                                  std::size_t n,
+                                  std::size_t k,
+                                  std::size_t m,
+                                  std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    const std::size_t min_nm = std::min(n, m);
+    const std::size_t max_nm = std::max(n, m);
+
+    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
+        static constexpr std::size_t single_batch_nelems = 1;
+        return gemm_batch_nm_contig_impl<lhsTy, rhsTy, resTy>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            depends);
+    }
+
+    if (k == 0) {
+        sycl::event gemm_no_reduction_ev =
+            exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(depends);
+                cgh.fill<resTy>(res_tp, resTy(0), n * m);
+            });
+        return gemm_no_reduction_ev;
+    }
+
+    if (max_nm < 64) {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (!is_complex<resTy>::value) {
+            if (m < 4) {
+                return gemm_contig_tree_k_impl<lhsTy, rhsTy, resTy, 1>(
+                    exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends);
+            }
+            else {
+                return gemm_contig_tree_k_impl<lhsTy, rhsTy, resTy, 4>(
+                    exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends);
+            }
+        }
+        else {
+            return gemm_contig_tree_k_impl<lhsTy, rhsTy, resTy, 1>(
+                exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends);
+        }
+    }
+    else { // m > 1, n > k or m > k
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (!is_complex<resTy>::value) {
+            return gemm_contig_tree_nm_impl<lhsTy, rhsTy, resTy, 4>(
+                exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends);
+        }
+        else {
+            return gemm_contig_tree_nm_impl<lhsTy, rhsTy, resTy, 1>(
+                exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends);
+        }
+    }
+}
+
+} // namespace dpctl::tensor::kernels
diff --git a/dpctl_ext/tensor/libtensor/source/linalg_functions/dot.cpp b/dpctl_ext/tensor/libtensor/source/linalg_functions/dot.cpp
new file mode 100644
index 000000000000..05ee37594e12
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/linalg_functions/dot.cpp
@@ -0,0 +1,839 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <exception>
+#include <iterator>
+#include <stdexcept>
+#include <string>
+#include <sycl/sycl.hpp>
+#include <utility>
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "dot.hpp"
+#include "dot_atomic_support.hpp"
+#include "dot_dispatch.hpp"
+#include "elementwise_functions/elementwise_functions_type_utils.hpp"
+#include "kernels/linalg_functions/dot_product.hpp"
+#include "kernels/linalg_functions/gemm.hpp"
+#include "reductions/reduction_atomic_support.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+static int dot_output_id_table[td_ns::num_types][td_ns::num_types];
+
+using dpctl::tensor::kernels::dot_product_impl_fn_ptr_t;
+static dot_product_impl_fn_ptr_t dot_product_dispatch_table[td_ns::num_types]
+                                                           [td_ns::num_types];
+
+static dot_product_impl_fn_ptr_t
+    dot_product_temps_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+using dpctl::tensor::kernels::dot_product_contig_impl_fn_ptr_t;
+static dot_product_contig_impl_fn_ptr_t
+    dot_product_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static dot_product_contig_impl_fn_ptr_t
+    dot_product_contig_temps_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+using dpctl::tensor::kernels::gemm_impl_fn_ptr_t;
+static gemm_impl_fn_ptr_t gemm_atomic_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+
+static gemm_impl_fn_ptr_t gemm_temps_dispatch_table[td_ns::num_types]
+                                                   [td_ns::num_types];
+
+using dpctl::tensor::kernels::gemm_contig_impl_fn_ptr_t;
+static gemm_contig_impl_fn_ptr_t
+    gemm_contig_atomic_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static gemm_contig_impl_fn_ptr_t
+    gemm_contig_temps_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+using dpctl::tensor::kernels::gemm_batch_impl_fn_ptr_t;
+static gemm_batch_impl_fn_ptr_t
+    gemm_batch_atomic_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static gemm_batch_impl_fn_ptr_t
+    gemm_batch_temps_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+using dpctl::tensor::kernels::gemm_batch_contig_impl_fn_ptr_t;
+static gemm_batch_contig_impl_fn_ptr_t
+    gemm_batch_contig_atomic_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static gemm_batch_contig_impl_fn_ptr_t
+    gemm_batch_contig_temps_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void init_dot_dispatch_tables(void)
+{
+    td_ns::DispatchTableBuilder<int, DotTypeMapFactory, td_ns::num_types> dtb1;
+    dtb1.populate_dispatch_table(dot_output_id_table);
+
+    td_ns::DispatchTableBuilder<gemm_batch_impl_fn_ptr_t,
+                                GemmBatchAtomicFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(gemm_batch_atomic_dispatch_table);
+
+    td_ns::DispatchTableBuilder<gemm_batch_contig_impl_fn_ptr_t,
+                                GemmBatchContigAtomicFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(gemm_batch_contig_atomic_dispatch_table);
+
+    td_ns::DispatchTableBuilder<gemm_impl_fn_ptr_t, GemmAtomicFactory,
+                                td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(gemm_atomic_dispatch_table);
+
+    td_ns::DispatchTableBuilder<gemm_contig_impl_fn_ptr_t,
+                                GemmContigAtomicFactory, td_ns::num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(gemm_contig_atomic_dispatch_table);
+
+    td_ns::DispatchTableBuilder<gemm_batch_impl_fn_ptr_t, GemmBatchTempsFactory,
+                                td_ns::num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(gemm_batch_temps_dispatch_table);
+
+    td_ns::DispatchTableBuilder<gemm_batch_contig_impl_fn_ptr_t,
+                                GemmBatchContigTempsFactory, td_ns::num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(gemm_batch_contig_temps_dispatch_table);
+
+    td_ns::DispatchTableBuilder<gemm_impl_fn_ptr_t, GemmTempsFactory,
+                                td_ns::num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(gemm_temps_dispatch_table);
+
+    td_ns::DispatchTableBuilder<gemm_contig_impl_fn_ptr_t,
+                                GemmContigTempsFactory, td_ns::num_types>
+        dtb9;
+    dtb9.populate_dispatch_table(gemm_contig_temps_dispatch_table);
+
+    td_ns::DispatchTableBuilder<dot_product_impl_fn_ptr_t,
+                                DotProductAtomicFactory, td_ns::num_types>
+        dtb10;
+    dtb10.populate_dispatch_table(dot_product_dispatch_table);
+
+    td_ns::DispatchTableBuilder<dot_product_impl_fn_ptr_t,
+                                DotProductNoAtomicFactory, td_ns::num_types>
+        dtb11;
+    dtb11.populate_dispatch_table(dot_product_temps_dispatch_table);
+
+    td_ns::DispatchTableBuilder<dot_product_contig_impl_fn_ptr_t,
+                                DotProductContigAtomicFactory, td_ns::num_types>
+        dtb12;
+    dtb12.populate_dispatch_table(dot_product_contig_dispatch_table);
+
+    td_ns::DispatchTableBuilder<dot_product_contig_impl_fn_ptr_t,
+                                DotProductContigNoAtomicFactory,
+                                td_ns::num_types>
+        dtb13;
+    dtb13.populate_dispatch_table(dot_product_contig_temps_dispatch_table);
+}
+
+using atomic_support::atomic_support_fn_ptr_t;
+static atomic_support_fn_ptr_t dot_atomic_support_vector[td_ns::num_types];
+
+void init_dot_atomic_support_vector(void)
+{
+
+    using atomic_support::DotAtomicSupportFactory;
+    td_ns::DispatchVectorBuilder<atomic_support_fn_ptr_t,
+                                 DotAtomicSupportFactory, td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(dot_atomic_support_vector);
+}
+
+std::pair<sycl::event, sycl::event>
+    py_dot(const dpctl::tensor::usm_ndarray &x1,
+           const dpctl::tensor::usm_ndarray &x2,
+           int batch_dims,
+           int x1_outer_dims,
+           int x2_outer_dims,
+           int inner_dims,
+           const dpctl::tensor::usm_ndarray &dst,
+           sycl::queue &exec_q,
+           const std::vector<sycl::event> &depends)
+{
+    if (!dpctl::utils::queues_are_compatible(exec_q, {x1, x2, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if (inner_dims == 0) {
+        throw py::value_error("No inner dimension for dot");
+    }
+
+    int x1_nd = x1.get_ndim();
+    int x2_nd = x2.get_ndim();
+    if (x1_nd != (batch_dims + x1_outer_dims + inner_dims) ||
+        x2_nd != (batch_dims + x2_outer_dims + inner_dims))
+    {
+        throw py::value_error("Input arrays do not have dimensions consistent "
+                              "with input dimensions");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != (batch_dims + x1_outer_dims + x2_outer_dims)) {
+        throw py::value_error("Destination array rank does not match input "
+                              "array rank and number of input dimensions");
+    }
+
+    const py::ssize_t *x1_shape_ptr = x1.get_shape_raw();
+    const py::ssize_t *x2_shape_ptr = x2.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t batches(1);
+    for (int i = 0; same_shapes && (i < batch_dims); ++i) {
+        same_shapes = same_shapes && (x1_shape_ptr[i] == dst_shape_ptr[i]) &&
+                      (x2_shape_ptr[i] == dst_shape_ptr[i]);
+        batches *= x1_shape_ptr[i];
+    }
+    std::size_t x1_outer_nelems(1);
+    for (int i = batch_dims; same_shapes && (i < (batch_dims + x1_outer_dims));
+         ++i) {
+        same_shapes = same_shapes && (x1_shape_ptr[i] == dst_shape_ptr[i]);
+        x1_outer_nelems *= x1_shape_ptr[i];
+    }
+    std::size_t inner_nelems(1);
+    for (int i = batch_dims; i < (batch_dims + inner_dims); ++i) {
+        auto x1_shape_idx = x1_outer_dims + i;
+        same_shapes =
+            same_shapes && (x1_shape_ptr[x1_shape_idx] == x2_shape_ptr[i]);
+        inner_nelems *= x1_shape_ptr[x1_shape_idx];
+    }
+    std::size_t x2_outer_nelems(1);
+    for (int i = 0; same_shapes && (i < x2_outer_dims); ++i) {
+        auto x2_shape_idx = batch_dims + inner_dims + i;
+        same_shapes =
+            same_shapes && (x2_shape_ptr[x2_shape_idx] ==
+                            dst_shape_ptr[batch_dims + x1_outer_dims + i]);
+        x2_outer_nelems *= x2_shape_ptr[x2_shape_idx];
+    }
+    if (!same_shapes) {
+        throw py::value_error("Input arrays to tensor dot product do not have "
+                              "appropriate shapes");
+    }
+
+    std::size_t dst_nelems = batches * x1_outer_nelems * x2_outer_nelems;
+    if (dst_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    if (static_cast<std::size_t>(dst.get_size()) != dst_nelems) {
+        throw py::value_error("dst shape and size mismatch");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with x1 or x2
+    if (overlap(dst, x1) || overlap(dst, x2)) {
+        throw py::value_error("Result array overlaps with inputs");
+    }
+
+    int x1_typenum = x1.get_typenum();
+    int x2_typenum = x2.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int x1_typeid = array_types.typenum_to_lookup_id(x1_typenum);
+    int x2_typeid = array_types.typenum_to_lookup_id(x2_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    int output_typeid = dot_output_id_table[x1_typeid][x2_typeid];
+
+    if (output_typeid != dst_typeid) {
+        throw py::value_error(
+            "Result array has unexpected elemental data type.");
+    }
+
+    void *data_ptr = dst.get_data();
+    const auto &ctx = exec_q.get_context();
+    auto usm_type = sycl::get_pointer_type(data_ptr, ctx);
+    bool supports_atomics =
+        dot_atomic_support_vector[output_typeid](exec_q, usm_type);
+
+    const char *x1_data = x1.get_data();
+    const char *x2_data = x2.get_data();
+    char *dst_data = dst.get_data();
+
+    const auto &x1_shape_vec = x1.get_shape_vector();
+    const auto &x1_strides_vec = x1.get_strides_vector();
+
+    const auto &x2_shape_vec = x2.get_shape_vector();
+    const auto &x2_strides_vec = x2.get_strides_vector();
+
+    const auto &dst_shape_vec = dst.get_shape_vector();
+    const auto &dst_strides_vec = dst.get_strides_vector();
+
+    bool is_x1_c_contig = x1.is_c_contiguous();
+    bool is_x1_f_contig = x1.is_f_contiguous();
+    bool is_x2_c_contig = x2.is_c_contiguous();
+    bool is_x2_f_contig = x2.is_f_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    bool call_vecdot = ((x1_outer_dims == 0 && x1_outer_nelems == 1) &&
+                        (x2_outer_dims == 0 && x2_outer_nelems == 1));
+
+    bool call_batched = (batch_dims != 0 || batches > 1);
+    std::vector<sycl::event> host_task_events{};
+    sycl::event dot_ev;
+    if (call_vecdot) {
+        if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig) ||
+            ((is_x1_f_contig && is_x2_f_contig) && !call_batched))
+        {
+            dot_product_contig_impl_fn_ptr_t fn = nullptr;
+            if (supports_atomics) {
+                fn = dot_product_contig_dispatch_table[x1_typeid][x2_typeid];
+            }
+            else {
+                fn = dot_product_contig_temps_dispatch_table[x1_typeid]
+                                                            [x2_typeid];
+            }
+            if (fn != nullptr) {
+                static constexpr py::ssize_t zero_offset = 0;
+                dot_ev = fn(exec_q, dst_nelems, inner_nelems, x1.get_data(),
+                            x2.get_data(), dst.get_data(),
+                            zero_offset, // lhs batch offset
+                            zero_offset, // rhs batch offset
+                            zero_offset, // res batch offset
+                            zero_offset, // lhs reduction offset
+                            zero_offset, // rhs reduction offset
+                            depends);
+                return std::make_pair(dpctl::utils::keep_args_alive(
+                                          exec_q, {x1, x2, dst}, {dot_ev}),
+                                      dot_ev);
+            }
+        }
+        int inner_nd = inner_dims;
+        const py::ssize_t *inner_shape_ptr = x1_shape_ptr + batch_dims;
+        using shT = std::vector<py::ssize_t>;
+        const shT inner_x1_strides(std::begin(x1_strides_vec) + batch_dims,
+                                   std::end(x1_strides_vec));
+        const shT inner_x2_strides(std::begin(x2_strides_vec) + batch_dims,
+                                   std::end(x2_strides_vec));
+
+        shT simplified_inner_shape;
+        shT simplified_inner_x1_strides;
+        shT simplified_inner_x2_strides;
+        py::ssize_t inner_x1_offset(0);
+        py::ssize_t inner_x2_offset(0);
+
+        simplify_iteration_space(
+            inner_nd, inner_shape_ptr, inner_x1_strides, inner_x2_strides,
+            // output
+            simplified_inner_shape, simplified_inner_x1_strides,
+            simplified_inner_x2_strides, inner_x1_offset, inner_x2_offset);
+
+        const py::ssize_t *batch_shape_ptr = x1_shape_ptr;
+
+        const shT batch_x1_strides(std::begin(x1_strides_vec),
+                                   std::begin(x1_strides_vec) + batch_dims);
+        const shT batch_x2_strides(std::begin(x2_strides_vec),
+                                   std::begin(x2_strides_vec) + batch_dims);
+        shT const &batch_dst_strides = dst_strides_vec;
+
+        shT simplified_batch_shape;
+        shT simplified_batch_x1_strides;
+        shT simplified_batch_x2_strides;
+        shT simplified_batch_dst_strides;
+        py::ssize_t batch_x1_offset(0);
+        py::ssize_t batch_x2_offset(0);
+        py::ssize_t batch_dst_offset(0);
+
+        if (batch_dims == 0) {
+            if (dst_nelems != 1) {
+                throw std::runtime_error(
+                    "batch_dims == 0, but dst_nelems != 1");
+            }
+            batch_dims = 1;
+            simplified_batch_shape.push_back(1);
+            simplified_batch_x1_strides.push_back(0);
+            simplified_batch_x2_strides.push_back(0);
+            simplified_batch_dst_strides.push_back(0);
+        }
+        else {
+            simplify_iteration_space_3(
+                batch_dims, batch_shape_ptr, batch_x1_strides, batch_x2_strides,
+                batch_dst_strides,
+                // output
+                simplified_batch_shape, simplified_batch_x1_strides,
+                simplified_batch_x2_strides, simplified_batch_dst_strides,
+                batch_x1_offset, batch_x2_offset, batch_dst_offset);
+        }
+
+        if (inner_nd == 1 && batch_dims == 1) {
+            bool dot_product_c_contig = false;
+            bool reduce_all_elems = false;
+
+            if (simplified_inner_x1_strides[0] == 1 &&
+                simplified_inner_x2_strides[0] == 1) {
+                reduce_all_elems = (simplified_batch_shape[0] == 1);
+                dot_product_c_contig =
+                    (simplified_batch_dst_strides[0] == 1) &&
+                    (static_cast<std::size_t>(simplified_batch_x1_strides[0]) ==
+                     inner_nelems) &&
+                    (static_cast<std::size_t>(simplified_batch_x2_strides[0]) ==
+                     inner_nelems);
+            }
+
+            if (dot_product_c_contig || reduce_all_elems) {
+                dot_product_contig_impl_fn_ptr_t fn = nullptr;
+                if (supports_atomics) {
+                    fn =
+                        dot_product_contig_dispatch_table[x1_typeid][x2_typeid];
+                }
+                else {
+                    fn = dot_product_contig_temps_dispatch_table[x1_typeid]
+                                                                [x2_typeid];
+                }
+                if (fn != nullptr) {
+                    dot_ev = fn(exec_q, dst_nelems, inner_nelems, x1.get_data(),
+                                x2.get_data(), dst.get_data(),
+                                batch_x1_offset,  // lhs batch offset
+                                batch_x2_offset,  // rhs batch offset
+                                batch_dst_offset, // res batch offset
+                                inner_x1_offset,  // lhs reduction offset
+                                inner_x2_offset,  // rhs reduction offset
+                                depends);
+                    return std::make_pair(dpctl::utils::keep_args_alive(
+                                              exec_q, {x1, x2, dst}, {dot_ev}),
+                                          dot_ev);
+                }
+            }
+        }
+
+        dot_product_impl_fn_ptr_t fn = nullptr;
+        if (supports_atomics) {
+            fn = dot_product_dispatch_table[x1_typeid][x2_typeid];
+        }
+        if (fn == nullptr) {
+            fn = dot_product_temps_dispatch_table[x1_typeid][x2_typeid];
+            if (fn == nullptr) {
+                throw std::runtime_error(
+                    "Implementation is missing for x1_typeid=" +
+                    std::to_string(x1_typeid) +
+                    " and x2_typeid=" + std::to_string(x2_typeid));
+            }
+        }
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto arrays_metainfo_packing_triple_ =
+            device_allocate_and_pack<py::ssize_t>(
+                exec_q, host_task_events,
+                // iteration metadata
+                simplified_batch_shape, simplified_batch_x1_strides,
+                simplified_batch_x2_strides, simplified_batch_dst_strides,
+                // reduction metadata
+                simplified_inner_shape, simplified_inner_x1_strides,
+                simplified_inner_x2_strides);
+        auto tmp_alloc_owner =
+            std::move(std::get<0>(arrays_metainfo_packing_triple_));
+        const auto &copy_metadata_ev =
+            std::get<2>(arrays_metainfo_packing_triple_);
+        const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get();
+
+        const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
+        const py::ssize_t *inner_shape_stride =
+            temp_allocation_ptr + 4 * simplified_batch_shape.size();
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.resize(depends.size());
+        std::copy(depends.begin(), depends.end(), all_deps.begin());
+        all_deps.push_back(copy_metadata_ev);
+
+        dot_ev =
+            fn(exec_q, dst_nelems, inner_nelems, x1.get_data(), x2.get_data(),
+               dst.get_data(), batch_dims, iter_shape_and_strides,
+               batch_x1_offset, batch_x2_offset, batch_dst_offset,
+               inner_nd, // number dimensions being reduced
+               inner_shape_stride, inner_x1_offset, inner_x2_offset, all_deps);
+
+        sycl::event temp_cleanup_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(exec_q, {dot_ev},
+                                                         tmp_alloc_owner);
+        host_task_events.push_back(temp_cleanup_ev);
+    }
+    else { // if (!call_vecdot)
+        if (!call_batched) {
+            if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig)) {
+                gemm_contig_impl_fn_ptr_t fn = nullptr;
+                if (supports_atomics) {
+                    fn =
+                        gemm_contig_atomic_dispatch_table[x1_typeid][x2_typeid];
+                }
+                else {
+                    fn = gemm_contig_temps_dispatch_table[x1_typeid][x2_typeid];
+                }
+                if (fn != nullptr) {
+                    dot_ev = fn(exec_q, x1_data, x2_data, dst_data,
+                                x1_outer_nelems, // n
+                                inner_nelems,    // k
+                                x2_outer_nelems, // m
+                                depends);
+                    return std::make_pair(dpctl::utils::keep_args_alive(
+                                              exec_q, {x1, x2, dst}, {dot_ev}),
+                                          dot_ev);
+                }
+            }
+            gemm_impl_fn_ptr_t fn = nullptr;
+            if (supports_atomics) {
+                fn = gemm_atomic_dispatch_table[x1_typeid][x2_typeid];
+            }
+            if (fn == nullptr) {
+                fn = gemm_temps_dispatch_table[x1_typeid][x2_typeid];
+                if (fn == nullptr) {
+                    throw std::runtime_error(
+                        "Implementation is missing for x1_typeid=" +
+                        std::to_string(x1_typeid) +
+                        " and x2_typeid=" + std::to_string(x2_typeid));
+                }
+            }
+            using dpctl::tensor::offset_utils::device_allocate_and_pack;
+            auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+                exec_q, host_task_events, x1_shape_vec, x1_strides_vec,
+                x2_shape_vec, x2_strides_vec, dst_shape_vec, dst_strides_vec);
+            auto packed_shapes_strides_owner =
+                std::move(std::get<0>(ptr_size_event_tuple1));
+            sycl::event copy_shapes_strides_ev =
+                std::get<2>(ptr_size_event_tuple1);
+            const py::ssize_t *packed_shapes_strides =
+                packed_shapes_strides_owner.get();
+
+            const py::ssize_t *x1_shape_strides = packed_shapes_strides;
+            const py::ssize_t *x2_shape_strides =
+                packed_shapes_strides + 2 * (x1_nd);
+            const py::ssize_t *dst_shape_strides =
+                packed_shapes_strides + 2 * (x1_nd + x2_nd);
+
+            std::vector<sycl::event> all_deps;
+            all_deps.reserve(depends.size() + 1);
+            all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+            all_deps.push_back(copy_shapes_strides_ev);
+
+            // change gemm calls to pass inner dims and outer dims separately
+            dot_ev =
+                fn(exec_q, x1_data, x2_data, dst_data, x1_outer_nelems,
+                   inner_nelems, x2_outer_nelems, inner_dims, x1_outer_dims,
+                   x1_shape_strides, x2_outer_dims, x2_shape_strides,
+                   x1_outer_dims + x2_outer_dims, dst_shape_strides, all_deps);
+
+            sycl::event cleanup_tmp_allocations_ev =
+                dpctl::tensor::alloc_utils::async_smart_free(
+                    exec_q, {dot_ev}, packed_shapes_strides_owner);
+            host_task_events.push_back(cleanup_tmp_allocations_ev);
+        }
+        else { // if (call_batched)
+            using shT = std::vector<py::ssize_t>;
+            // temporary asserts for matmul
+            assert(x1_outer_dims == 1);
+            assert(x2_outer_dims == 1);
+            assert(inner_dims == 1);
+
+            if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig)) {
+                gemm_batch_contig_impl_fn_ptr_t fn = nullptr;
+                if (supports_atomics) {
+                    fn = gemm_batch_contig_atomic_dispatch_table[x1_typeid]
+                                                                [x2_typeid];
+                }
+                else {
+                    fn = gemm_batch_contig_temps_dispatch_table[x1_typeid]
+                                                               [x2_typeid];
+                }
+                if (fn != nullptr) {
+                    static constexpr py::ssize_t zero_offset = 0;
+                    dot_ev = fn(exec_q, x1_data, x2_data, dst_data, batches,
+                                x1_outer_nelems, // n
+                                inner_nelems,    // k
+                                x2_outer_nelems, // m
+                                zero_offset, zero_offset, zero_offset, depends);
+                    return std::make_pair(dpctl::utils::keep_args_alive(
+                                              exec_q, {x1, x2, dst}, {dot_ev}),
+                                          dot_ev);
+                }
+            }
+
+            auto x1_outer_inner_dims = x1_nd - batch_dims;
+            auto x2_outer_inner_dims = x2_nd - batch_dims;
+            auto dst_outer_inner_dims = dst_nd - batch_dims;
+
+            shT batch_x1_shape;
+            shT outer_inner_x1_shape;
+            shT batch_x1_strides;
+            shT outer_inner_x1_strides;
+            split_iteration_space(x1_shape_vec, x1_strides_vec, batch_dims,
+                                  batch_dims + x1_outer_inner_dims,
+                                  // 4 vectors modified
+                                  batch_x1_shape, outer_inner_x1_shape,
+                                  batch_x1_strides, outer_inner_x1_strides);
+
+            shT batch_x2_shape;
+            shT outer_inner_x2_shape;
+            shT batch_x2_strides;
+            shT outer_inner_x2_strides;
+            split_iteration_space(x2_shape_vec, x2_strides_vec, batch_dims,
+                                  batch_dims + x2_outer_inner_dims,
+                                  // 4 vectors modified
+                                  batch_x2_shape, outer_inner_x2_shape,
+                                  batch_x2_strides, outer_inner_x2_strides);
+
+            shT batch_dst_shape;
+            shT outer_inner_dst_shape;
+            shT batch_dst_strides;
+            shT outer_inner_dst_strides;
+            split_iteration_space(dst_shape_vec, dst_strides_vec, batch_dims,
+                                  batch_dims + dst_outer_inner_dims,
+                                  // 4 vectors modified
+                                  batch_dst_shape, outer_inner_dst_shape,
+                                  batch_dst_strides, outer_inner_dst_strides);
+
+            using shT = std::vector<py::ssize_t>;
+            shT simplified_batch_shape;
+            shT simplified_batch_x1_strides;
+            shT simplified_batch_x2_strides;
+            shT simplified_batch_dst_strides;
+            py::ssize_t x1_batch_offset(0);
+            py::ssize_t x2_batch_offset(0);
+            py::ssize_t dst_batch_offset(0);
+
+            const py::ssize_t *shape = x1_shape_ptr;
+
+            simplify_iteration_space_3(
+                batch_dims, shape, batch_x1_strides, batch_x2_strides,
+                batch_dst_strides,
+                // outputs
+                simplified_batch_shape, simplified_batch_x1_strides,
+                simplified_batch_x2_strides, simplified_batch_dst_strides,
+                x1_batch_offset, x2_batch_offset, dst_batch_offset);
+
+            if (batch_dims == 1 && x1_outer_dims == 1 && x2_outer_dims == 1 &&
+                inner_dims == 1)
+            {
+                bool gemm_batch_c_contig = false;
+
+                if ((static_cast<std::size_t>(outer_inner_x1_strides[0]) ==
+                         inner_nelems &&
+                     outer_inner_x1_strides[1] == 1) &&
+                    (static_cast<std::size_t>(outer_inner_x2_strides[0]) ==
+                         inner_nelems &&
+                     outer_inner_x2_strides[1] == 1) &&
+                    (static_cast<std::size_t>(outer_inner_dst_strides[0]) ==
+                         x2_outer_nelems &&
+                     outer_inner_dst_strides[1] == 1))
+                {
+                    gemm_batch_c_contig =
+                        (static_cast<std::size_t>(
+                             simplified_batch_x1_strides[0]) ==
+                         x1_outer_nelems * inner_nelems) &&
+                        (static_cast<std::size_t>(
+                             simplified_batch_x2_strides[0]) ==
+                         x2_outer_nelems * inner_nelems) &&
+                        (static_cast<std::size_t>(
+                             simplified_batch_dst_strides[0]) ==
+                         x1_outer_nelems * x2_outer_nelems);
+                }
+
+                if (gemm_batch_c_contig) {
+                    gemm_batch_contig_impl_fn_ptr_t fn = nullptr;
+                    if (supports_atomics) {
+                        fn = gemm_batch_contig_atomic_dispatch_table[x1_typeid]
+                                                                    [x2_typeid];
+                    }
+                    else {
+                        fn = gemm_batch_contig_temps_dispatch_table[x1_typeid]
+                                                                   [x2_typeid];
+                    }
+                    if (fn != nullptr) {
+                        dot_ev = fn(exec_q, x1_data, x2_data, dst_data, batches,
+                                    x1_outer_nelems, // n
+                                    inner_nelems,    // k
+                                    x2_outer_nelems, // m
+                                    x1_batch_offset, x2_batch_offset,
+                                    dst_batch_offset, depends);
+                        return std::make_pair(
+                            dpctl::utils::keep_args_alive(exec_q, {x1, x2, dst},
+                                                          {dot_ev}),
+                            dot_ev);
+                    }
+                }
+            }
+
+            gemm_batch_impl_fn_ptr_t fn = nullptr;
+            if (supports_atomics) {
+                fn = gemm_batch_atomic_dispatch_table[x1_typeid][x2_typeid];
+            }
+            if (fn == nullptr) {
+                fn = gemm_batch_temps_dispatch_table[x1_typeid][x2_typeid];
+                if (fn == nullptr) {
+                    throw std::runtime_error(
+                        "Implementation is missing for x1_typeid=" +
+                        std::to_string(x1_typeid) +
+                        " and x2_typeid=" + std::to_string(x2_typeid));
+                }
+            }
+            using dpctl::tensor::offset_utils::device_allocate_and_pack;
+            auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+                exec_q, host_task_events, simplified_batch_shape,
+                simplified_batch_x1_strides, simplified_batch_x2_strides,
+                simplified_batch_dst_strides, outer_inner_x1_shape,
+                outer_inner_x1_strides, outer_inner_x2_shape,
+                outer_inner_x2_strides, outer_inner_dst_shape,
+                outer_inner_dst_strides,
+                // full shape and strides of the result array
+                // necessary for reduction and initialization
+                simplified_batch_shape, outer_inner_dst_shape,
+                simplified_batch_dst_strides, outer_inner_dst_strides);
+            auto packed_shapes_strides_owner =
+                std::move(std::get<0>(ptr_size_event_tuple1));
+            sycl::event copy_shapes_strides_ev =
+                std::get<2>(ptr_size_event_tuple1);
+            const py::ssize_t *packed_shapes_strides =
+                packed_shapes_strides_owner.get();
+
+            const auto batch_shape_strides = packed_shapes_strides;
+            const auto x1_outer_inner_shapes_strides =
+                packed_shapes_strides + 4 * batch_dims;
+            const auto x2_outer_inner_shapes_strides =
+                packed_shapes_strides + 4 * batch_dims +
+                2 * (x1_outer_inner_dims);
+            const auto dst_outer_shapes_strides =
+                packed_shapes_strides + 4 * batch_dims +
+                2 * (x1_outer_inner_dims) + 2 * (x2_outer_inner_dims);
+            const auto dst_full_shape_strides =
+                packed_shapes_strides + 4 * batch_dims +
+                2 * (x1_outer_inner_dims) + 2 * (x2_outer_inner_dims) +
+                2 * (dst_outer_inner_dims);
+
+            std::vector<sycl::event> all_deps;
+            all_deps.reserve(depends.size() + 1);
+            all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+            all_deps.push_back(copy_shapes_strides_ev);
+
+            dot_ev = fn(
+                exec_q, x1_data, x2_data, dst_data, batches, x1_outer_nelems,
+                inner_nelems, x2_outer_nelems, batch_dims, batch_shape_strides,
+                x1_batch_offset, x2_batch_offset, dst_batch_offset, inner_dims,
+                x1_outer_dims, x1_outer_inner_shapes_strides, x2_outer_dims,
+                x2_outer_inner_shapes_strides, x1_outer_dims + x2_outer_dims,
+                dst_outer_shapes_strides, dst_full_shape_strides, all_deps);
+
+            sycl::event cleanup_tmp_allocations_ev =
+                dpctl::tensor::alloc_utils::async_smart_free(
+                    exec_q, {dot_ev}, packed_shapes_strides_owner);
+            host_task_events.push_back(cleanup_tmp_allocations_ev);
+        }
+    }
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {x1, x2, dst}, host_task_events),
+        dot_ev);
+}
+
+template <typename output_typesT>
+py::object py_dot_result_type(const py::dtype &input1_dtype,
+                              const py::dtype &input2_dtype,
+                              const output_typesT &output_types_table)
+{
+    int tn1 = input1_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int tn2 = input2_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int src1_typeid = -1;
+    int src2_typeid = -1;
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    try {
+        src1_typeid = array_types.typenum_to_lookup_id(tn1);
+        src2_typeid = array_types.typenum_to_lookup_id(tn2);
+    } catch (const std::exception &e) {
+        throw py::value_error(e.what());
+    }
+
+    if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 ||
+        src2_typeid >= td_ns::num_types)
+    {
+        throw std::runtime_error("binary output type lookup failed");
+    }
+    int dst_typeid = output_types_table[src1_typeid][src2_typeid];
+
+    if (dst_typeid < 0) {
+        auto res = py::none();
+        return py::cast<py::object>(res);
+    }
+    else {
+        auto dst_typenum_t = static_cast<td_ns::typenum_t>(dst_typeid);
+        auto dt = type_utils::_dtype_from_typenum(dst_typenum_t);
+
+        return py::cast<py::object>(dt);
+    }
+}
+
+void init_dot(py::module_ m)
+{
+    init_dot_atomic_support_vector();
+    init_dot_dispatch_tables();
+
+    m.def("_dot", &py_dot, "", py::arg("x1"), py::arg("x2"),
+          py::arg("batch_dims"), py::arg("x1_outer_dims"),
+          py::arg("x2_outer_dims"), py::arg("inner_dims"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto dot_result_type_pyapi = [&](const py::dtype &dtype1,
+                                     const py::dtype &dtype2) {
+        return py_dot_result_type(dtype1, dtype2, dot_output_id_table);
+    };
+    m.def("_dot_result_type", dot_result_type_pyapi, "");
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/linalg_functions/dot.hpp b/dpctl_ext/tensor/libtensor/source/linalg_functions/dot.hpp
new file mode 100644
index 000000000000..f6a23ace5cd9
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/linalg_functions/dot.hpp
@@ -0,0 +1,45 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_dot(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp b/dpctl_ext/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
new file mode 100644
index 000000000000..66b9b5004575
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
@@ -0,0 +1,58 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+
+#include "reductions/reduction_atomic_support.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::py_internal::atomic_support
+{
+
+template <typename fnT, typename T>
+struct DotAtomicSupportFactory
+{
+    fnT get()
+    {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<T>::value) {
+            return atomic_support::fixed_decision<false>;
+        }
+        else {
+            return atomic_support::check_atomic_support<T>;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::py_internal::atomic_support
diff --git a/dpctl_ext/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp b/dpctl_ext/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
new file mode 100644
index 000000000000..984f71a4c183
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
@@ -0,0 +1,405 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+
+#include "kernels/linalg_functions/dot_product.hpp"
+#include "kernels/linalg_functions/gemm.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename T1, typename T2>
+struct DotAtomicOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, double>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+// add separate type support lists for atomic vs. temps
+// gemm, gevm, and dot product share output type struct
+template <typename T1, typename T2>
+struct DotNoAtomicOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<double>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+template <typename fnT, typename T1, typename T2>
+struct DotTypeMapFactory
+{
+    /*! @brief get typeid for output type of kernels called by py_dot */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT1 = typename DotNoAtomicOutputType<T1, T2>::value_type;
+        using rT2 = typename DotAtomicOutputType<T1, T2>::value_type;
+        static_assert(std::is_same_v<rT1, rT2> || std::is_same_v<rT2, void>);
+        return td_ns::GetTypeid<rT1>{}.get();
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct GemmBatchAtomicFactory
+{
+    fnT get()
+    {
+        if constexpr (!DotAtomicOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using dpctl::tensor::kernels::gemm_batch_impl;
+            using T3 = typename DotAtomicOutputType<T1, T2>::value_type;
+            fnT fn = gemm_batch_impl<T1, T2, T3>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct GemmBatchContigAtomicFactory
+{
+    fnT get()
+    {
+        if constexpr (!DotAtomicOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using dpctl::tensor::kernels::gemm_batch_contig_impl;
+            using T3 = typename DotAtomicOutputType<T1, T2>::value_type;
+            fnT fn = gemm_batch_contig_impl<T1, T2, T3>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct GemmAtomicFactory
+{
+    fnT get()
+    {
+        if constexpr (!DotAtomicOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using dpctl::tensor::kernels::gemm_impl;
+            using T3 = typename DotAtomicOutputType<T1, T2>::value_type;
+            fnT fn = gemm_impl<T1, T2, T3>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct GemmContigAtomicFactory
+{
+    fnT get()
+    {
+        if constexpr (!DotAtomicOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using dpctl::tensor::kernels::gemm_contig_impl;
+            using T3 = typename DotAtomicOutputType<T1, T2>::value_type;
+            fnT fn = gemm_contig_impl<T1, T2, T3>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct GemmTempsFactory
+{
+    fnT get()
+    {
+        if constexpr (!DotNoAtomicOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using dpctl::tensor::kernels::gemm_tree_impl;
+            using T3 = typename DotNoAtomicOutputType<T1, T2>::value_type;
+            fnT fn = gemm_tree_impl<T1, T2, T3>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct GemmContigTempsFactory
+{
+    fnT get()
+    {
+        if constexpr (!DotNoAtomicOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using dpctl::tensor::kernels::gemm_contig_tree_impl;
+            using T3 = typename DotNoAtomicOutputType<T1, T2>::value_type;
+            fnT fn = gemm_contig_tree_impl<T1, T2, T3>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct GemmBatchTempsFactory
+{
+    fnT get()
+    {
+        if constexpr (!DotNoAtomicOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using dpctl::tensor::kernels::gemm_batch_tree_impl;
+            using T3 = typename DotNoAtomicOutputType<T1, T2>::value_type;
+            fnT fn = gemm_batch_tree_impl<T1, T2, T3>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct GemmBatchContigTempsFactory
+{
+    fnT get()
+    {
+        if constexpr (!DotNoAtomicOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using dpctl::tensor::kernels::gemm_batch_contig_tree_impl;
+            using T3 = typename DotNoAtomicOutputType<T1, T2>::value_type;
+            fnT fn = gemm_batch_contig_tree_impl<T1, T2, T3>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct DotProductAtomicFactory
+{
+    fnT get()
+    {
+        if constexpr (!DotAtomicOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using dpctl::tensor::kernels::dot_product_impl;
+            using T3 = typename DotAtomicOutputType<T1, T2>::value_type;
+            fnT fn = dot_product_impl<T1, T2, T3>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct DotProductNoAtomicFactory
+{
+    fnT get()
+    {
+        if constexpr (!DotNoAtomicOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using dpctl::tensor::kernels::dot_product_tree_impl;
+            using T3 = typename DotNoAtomicOutputType<T1, T2>::value_type;
+            fnT fn = dot_product_tree_impl<T1, T2, T3>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct DotProductContigAtomicFactory
+{
+    fnT get()
+    {
+        if constexpr (!DotAtomicOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using dpctl::tensor::kernels::dot_product_contig_impl;
+            using T3 = typename DotAtomicOutputType<T1, T2>::value_type;
+            fnT fn = dot_product_contig_impl<T1, T2, T3>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct DotProductContigNoAtomicFactory
+{
+    fnT get()
+    {
+        if constexpr (!DotNoAtomicOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using dpctl::tensor::kernels::dot_product_contig_tree_impl;
+            using T3 = typename DotNoAtomicOutputType<T1, T2>::value_type;
+            fnT fn = dot_product_contig_tree_impl<T1, T2, T3>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_linalg.cpp b/dpctl_ext/tensor/libtensor/source/tensor_linalg.cpp
new file mode 100644
index 000000000000..4a1b5fb79b9e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/tensor_linalg.cpp
@@ -0,0 +1,41 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#include "linalg_functions/dot.hpp"
+#include <pybind11/pybind11.h>
+
+PYBIND11_MODULE(_tensor_linalg_impl, m)
+{
+    dpctl::tensor::py_internal::init_dot(m);
+}
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index b5afd9523d67..2ff08cc6ec8b 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -2370,7 +2370,7 @@ def matrix_transpose(x, /):
             f"but it is {usm_x.ndim}"
         )
 
-    usm_res = dpt.matrix_transpose(usm_x)
+    usm_res = dpt_ext.matrix_transpose(usm_x)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index b01f57eaecdd..28ed40ab5f61 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -26,8 +26,6 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
 
-import dpctl
-import dpctl.tensor as dpt
 import dpctl.utils as dpu
 import numpy
 from dpctl.utils import ExecutionPlacementError
@@ -35,6 +33,7 @@
 # pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 import dpnp.backend.extensions.blas._blas_impl as bi
@@ -696,7 +695,7 @@ def _validate_out_array(out, exec_q):
     """Validate out is supported array and has correct queue."""
     if out is not None:
         dpnp.check_supported_arrays_type(out)
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+        if dpu.get_execution_queue((exec_q, out.sycl_queue)) is None:
             raise ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )

From 5dcdd271adcdf31b575be3af2e9ab321e31f7d6a Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 18 Mar 2026 15:23:20 +0100
Subject: [PATCH 17/43] Extend `_tensor_elementwise_impl` with binary functions
 and use it for dpnp  (#2803)

This PR extends `_tensor_elementwise_impl` with part of binary functions
: `add, atan2, bitwise_and, bitwise_left_shift, bitwise_or,
bitwise_right_shift, bitwise_xor`
---
 dpctl_ext/tensor/CMakeLists.txt               |  14 +-
 dpctl_ext/tensor/__init__.py                  |  14 +
 dpctl_ext/tensor/_elementwise_common.py       | 715 +++++++++++++++++-
 dpctl_ext/tensor/_elementwise_funcs.py        | 257 ++++++-
 .../kernels/elementwise_functions/add.hpp     | 688 +++++++++++++++++
 .../kernels/elementwise_functions/atan2.hpp   | 233 ++++++
 .../elementwise_functions/bitwise_and.hpp     | 461 +++++++++++
 .../bitwise_left_shift.hpp                    | 485 ++++++++++++
 .../elementwise_functions/bitwise_or.hpp      | 461 +++++++++++
 .../bitwise_right_shift.hpp                   | 493 ++++++++++++
 .../elementwise_functions/bitwise_xor.hpp     | 465 ++++++++++++
 .../elementwise_functions/common_inplace.hpp  | 478 ++++++++++++
 .../kernels/linalg_functions/dot_product.hpp  |   1 +
 .../include/kernels/linalg_functions/gemm.hpp |   3 +
 .../source/elementwise_functions/add.cpp      | 243 ++++++
 .../source/elementwise_functions/add.hpp      |  46 ++
 .../source/elementwise_functions/atan2.cpp    | 146 ++++
 .../source/elementwise_functions/atan2.hpp    |  46 ++
 .../elementwise_functions/bitwise_and.cpp     | 206 +++++
 .../elementwise_functions/bitwise_and.hpp     |  46 ++
 .../bitwise_left_shift.cpp                    | 216 ++++++
 .../bitwise_left_shift.hpp                    |  46 ++
 .../elementwise_functions/bitwise_or.cpp      | 206 +++++
 .../elementwise_functions/bitwise_or.hpp      |  46 ++
 .../bitwise_right_shift.cpp                   | 217 ++++++
 .../bitwise_right_shift.hpp                   |  46 ++
 .../elementwise_functions/bitwise_xor.cpp     | 206 +++++
 .../elementwise_functions/bitwise_xor.hpp     |  46 ++
 .../elementwise_common.cpp                    |  28 +-
 .../elementwise_functions.hpp                 | 532 +++++++++++++
 .../elementwise_functions.hpp                 |   1 +
 dpnp/backend/include/dpnp4pybind11.hpp        |   1 +
 dpnp/dpnp_iface_bitwise.py                    |   7 +-
 dpnp/dpnp_iface_mathematical.py               |   4 +-
 dpnp/dpnp_iface_trigonometric.py              |   4 +-
 35 files changed, 7076 insertions(+), 31 deletions(-)
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/add.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/add.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/atan2.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/atan2.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index afc7dca4db33..b032dc34bdb3 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -75,19 +75,19 @@ set(_elementwise_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acos.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acosh.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/add.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/add.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/angle.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asin.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asinh.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan2.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan2.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atanh.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_and.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_and.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_invert.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_or.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_xor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_or.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_xor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index a6127f1fc27c..5172d426334a 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -57,12 +57,19 @@
     abs,
     acos,
     acosh,
+    add,
     angle,
     asin,
     asinh,
     atan,
+    atan2,
     atanh,
+    bitwise_and,
     bitwise_invert,
+    bitwise_left_shift,
+    bitwise_or,
+    bitwise_right_shift,
+    bitwise_xor,
     cbrt,
     ceil,
     conj,
@@ -158,6 +165,7 @@
     "abs",
     "acos",
     "acosh",
+    "add",
     "all",
     "angle",
     "any",
@@ -172,7 +180,13 @@
     "astype",
     "atan",
     "atanh",
+    "atan2",
+    "bitwise_and",
     "bitwise_invert",
+    "bitwise_left_shift",
+    "bitwise_or",
+    "bitwise_right_shift",
+    "bitwise_xor",
     "broadcast_arrays",
     "broadcast_to",
     "can_cast",
diff --git a/dpctl_ext/tensor/_elementwise_common.py b/dpctl_ext/tensor/_elementwise_common.py
index 7811c01d9ce2..7fd9dabf9614 100644
--- a/dpctl_ext/tensor/_elementwise_common.py
+++ b/dpctl_ext/tensor/_elementwise_common.py
@@ -35,11 +35,22 @@
 import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._tensor_impl as ti
 
-from ._copy_utils import _empty_like_orderK
+from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
+from ._manipulation_functions import _broadcast_shape_impl
+from ._scalar_utils import (
+    _get_dtype,
+    _get_queue_usm_type,
+    _get_shape,
+    _validate_dtype,
+)
 from ._type_utils import (
+    _acceptance_fn_default_binary,
     _acceptance_fn_default_unary,
     _all_data_types,
     _find_buf_dtype,
+    _find_buf_dtype2,
+    _find_buf_dtype_in_place_op,
+    _resolve_weak_types,
 )
 
 
@@ -283,3 +294,705 @@ def __call__(self, x, /, *, out=None, order="K"):
         _manager.add_event_pair(ht, uf_ev)
 
         return out
+
+
+class BinaryElementwiseFunc:
+    """
+    Class that implements binary element-wise functions.
+
+    Args:
+        name (str):
+            Name of the unary function
+        result_type_resovle_fn (callable):
+            Function that takes dtypes of the input and
+            returns the dtype of the result if the
+            implementation functions supports it, or
+            returns `None` otherwise.
+        binary_dp_impl_fn (callable):
+            Data-parallel implementation function with signature
+            `impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray,
+             sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src1` and `src2` are the argument arrays, `dst` is the
+            array to be populated with function values,
+            i.e. `dst=func(src1, src2)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including lifetime management of argument Python objects to ensure
+            that their associated USM allocation is not freed before offloaded
+            computational tasks complete execution, while the second event
+            corresponds to computational tasks associated with function
+            evaluation.
+        docs (str):
+            Documentation string for the unary function.
+        binary_inplace_fn (callable, optional):
+            Data-parallel implementation function with signature
+            `impl_fn(src: usm_ndarray, dst: usm_ndarray,
+             sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src` is the argument array, `dst` is the
+            array to be populated with function values,
+            i.e. `dst=func(dst, src)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including async lifetime management of Python arguments,
+            while the second event corresponds to computational tasks
+            associated with function evaluation.
+        acceptance_fn (callable, optional):
+            Function to influence type promotion behavior of this binary
+            function. The function takes 6 arguments:
+                arg1_dtype - Data type of the first argument
+                arg2_dtype - Data type of the second argument
+                ret_buf1_dtype - Data type the first argument would be cast to
+                ret_buf2_dtype - Data type the second argument would be cast to
+                res_dtype - Data type of the output array with function values
+                sycl_dev - The :class:`dpctl.SyclDevice` where the function
+                    evaluation is carried out.
+            The function is only called when both arguments of the binary
+            function require casting, e.g. both arguments of
+            `dpctl.tensor.logaddexp` are arrays with integral data type.
+    """
+
+    def __init__(
+        self,
+        name,
+        result_type_resolver_fn,
+        binary_dp_impl_fn,
+        docs,
+        binary_inplace_fn=None,
+        acceptance_fn=None,
+        weak_type_resolver=None,
+    ):
+        self.__name__ = "BinaryElementwiseFunc"
+        self.name_ = name
+        self.result_type_resolver_fn_ = result_type_resolver_fn
+        self.types_ = None
+        self.binary_fn_ = binary_dp_impl_fn
+        self.binary_inplace_fn_ = binary_inplace_fn
+        self.__doc__ = docs
+        if callable(acceptance_fn):
+            self.acceptance_fn_ = acceptance_fn
+        else:
+            self.acceptance_fn_ = _acceptance_fn_default_binary
+        if callable(weak_type_resolver):
+            self.weak_type_resolver_ = weak_type_resolver
+        else:
+            self.weak_type_resolver_ = _resolve_weak_types
+
+    def __str__(self):
+        return f"<{self.__name__} '{self.name_}'>"
+
+    def __repr__(self):
+        return f"<{self.__name__} '{self.name_}'>"
+
+    def get_implementation_function(self):
+        """Returns the out-of-place implementation
+        function for this elementwise binary function.
+
+        """
+        return self.binary_fn_
+
+    def get_implementation_inplace_function(self):
+        """Returns the in-place implementation
+        function for this elementwise binary function.
+
+        """
+        return self.binary_inplace_fn_
+
+    def get_type_result_resolver_function(self):
+        """Returns the type resolver function for this
+        elementwise binary function.
+        """
+        return self.result_type_resolver_fn_
+
+    def get_type_promotion_path_acceptance_function(self):
+        """Returns the acceptance function for this
+        elementwise binary function.
+
+        Acceptance function influences the type promotion
+        behavior of this binary function.
+        The function takes 6 arguments:
+            arg1_dtype - Data type of the first argument
+            arg2_dtype - Data type of the second argument
+            ret_buf1_dtype - Data type the first argument would be cast to
+            ret_buf2_dtype - Data type the second argument would be cast to
+            res_dtype - Data type of the output array with function values
+            sycl_dev - :class:`dpctl.SyclDevice` on which function evaluation
+                is carried out.
+
+        The acceptance function is only invoked if both input arrays must be
+        cast to intermediary data types, as would happen during call of
+        `dpctl.tensor.hypot` with both arrays being of integral data type.
+        """
+        return self.acceptance_fn_
+
+    def get_array_dtype_scalar_type_resolver_function(self):
+        """Returns the function which determines how to treat
+        Python scalar types for this elementwise binary function.
+
+        Resolver influences what type the scalar will be
+        treated as prior to type promotion behavior.
+        The function takes 3 arguments:
+
+        Args:
+            o1_dtype (object, dtype):
+                A class representing a Python scalar type or a ``dtype``
+            o2_dtype (object, dtype):
+                A class representing a Python scalar type or a ``dtype``
+            sycl_dev (:class:`dpctl.SyclDevice`):
+                Device on which function evaluation is carried out.
+
+        One of ``o1_dtype`` and ``o2_dtype`` must be a ``dtype`` instance.
+        """
+        return self.weak_type_resolver_
+
+    @property
+    def nin(self):
+        """Returns the number of arguments treated as inputs."""
+        return 2
+
+    @property
+    def nout(self):
+        """Returns the number of arguments treated as outputs."""
+        return 1
+
+    @property
+    def types(self):
+        """Returns information about types supported by
+        implementation function, using NumPy's character
+        encoding for data types, e.g.
+
+        :Example:
+            .. code-block:: python
+
+                dpctl.tensor.divide.types
+                # Outputs: ['ee->e', 'ff->f', 'fF->F', 'dd->d', 'dD->D',
+                #    'Ff->F', 'FF->F', 'Dd->D', 'DD->D']
+        """
+        types = self.types_
+        if not types:
+            types = []
+            _all_dtypes = _all_data_types(True, True)
+            for dt1 in _all_dtypes:
+                for dt2 in _all_dtypes:
+                    dt3 = self.result_type_resolver_fn_(dt1, dt2)
+                    if dt3:
+                        types.append(f"{dt1.char}{dt2.char}->{dt3.char}")
+            self.types_ = types
+        return types
+
+    def __call__(self, o1, o2, /, *, out=None, order="K"):
+        if order not in ["K", "C", "F", "A"]:
+            order = "K"
+        q1, o1_usm_type = _get_queue_usm_type(o1)
+        q2, o2_usm_type = _get_queue_usm_type(o2)
+        if q1 is None and q2 is None:
+            raise ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments. "
+                "One of the arguments must represent USM allocation and "
+                "expose `__sycl_usm_array_interface__` property"
+            )
+        if q1 is None:
+            exec_q = q2
+            res_usm_type = o2_usm_type
+        elif q2 is None:
+            exec_q = q1
+            res_usm_type = o1_usm_type
+        else:
+            exec_q = dpctl.utils.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    o1_usm_type,
+                    o2_usm_type,
+                )
+            )
+        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+        o1_shape = _get_shape(o1)
+        o2_shape = _get_shape(o2)
+        if not all(
+            isinstance(s, (tuple, list))
+            for s in (
+                o1_shape,
+                o2_shape,
+            )
+        ):
+            raise TypeError(
+                "Shape of arguments can not be inferred. "
+                "Arguments are expected to be "
+                "lists, tuples, or both"
+            )
+        try:
+            res_shape = _broadcast_shape_impl(
+                [
+                    o1_shape,
+                    o2_shape,
+                ]
+            )
+        except ValueError:
+            raise ValueError(
+                "operands could not be broadcast together with shapes "
+                f"{o1_shape} and {o2_shape}"
+            )
+        sycl_dev = exec_q.sycl_device
+        o1_dtype = _get_dtype(o1, sycl_dev)
+        o2_dtype = _get_dtype(o2, sycl_dev)
+        if not all(_validate_dtype(o) for o in (o1_dtype, o2_dtype)):
+            raise ValueError("Operands have unsupported data types")
+
+        o1_dtype, o2_dtype = self.weak_type_resolver_(
+            o1_dtype, o2_dtype, sycl_dev
+        )
+
+        buf1_dt, buf2_dt, res_dt = _find_buf_dtype2(
+            o1_dtype,
+            o2_dtype,
+            self.result_type_resolver_fn_,
+            sycl_dev,
+            acceptance_fn=self.acceptance_fn_,
+        )
+
+        if res_dt is None:
+            raise ValueError(
+                f"function '{self.name_}' does not support input types "
+                f"({o1_dtype}, {o2_dtype}), "
+                "and the inputs could not be safely coerced to any "
+                "supported types according to the casting rule ''safe''."
+            )
+
+        orig_out = out
+        _manager = SequentialOrderManager[exec_q]
+        if out is not None:
+            if not isinstance(out, dpt.usm_ndarray):
+                raise TypeError(
+                    f"output array must be of usm_ndarray type, got {type(out)}"
+                )
+
+            if not out.flags.writable:
+                raise ValueError("provided `out` array is read-only")
+
+            if out.shape != res_shape:
+                raise ValueError(
+                    "The shape of input and output arrays are inconsistent. "
+                    f"Expected output shape is {res_shape}, got {out.shape}"
+                )
+
+            if res_dt != out.dtype:
+                raise ValueError(
+                    f"Output array of type {res_dt} is needed, "
+                    f"got {out.dtype}"
+                )
+
+            if (
+                dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
+                is None
+            ):
+                raise ExecutionPlacementError(
+                    "Input and output allocation queues are not compatible"
+                )
+
+            if isinstance(o1, dpt.usm_ndarray):
+                if ti._array_overlap(o1, out) and buf1_dt is None:
+                    if not ti._same_logical_tensors(o1, out):
+                        out = dpt_ext.empty_like(out)
+                    elif self.binary_inplace_fn_ is not None:
+                        # if there is a dedicated in-place kernel
+                        # it can be called here, otherwise continues
+                        if isinstance(o2, dpt.usm_ndarray):
+                            src2 = o2
+                            if (
+                                ti._array_overlap(o2, out)
+                                and not ti._same_logical_tensors(o2, out)
+                                and buf2_dt is None
+                            ):
+                                buf2_dt = o2_dtype
+                        else:
+                            src2 = dpt_ext.asarray(
+                                o2, dtype=o2_dtype, sycl_queue=exec_q
+                            )
+                        if buf2_dt is None:
+                            if src2.shape != res_shape:
+                                src2 = dpt_ext.broadcast_to(src2, res_shape)
+                            dep_evs = _manager.submitted_events
+                            ht_, comp_ev = self.binary_inplace_fn_(
+                                lhs=o1,
+                                rhs=src2,
+                                sycl_queue=exec_q,
+                                depends=dep_evs,
+                            )
+                            _manager.add_event_pair(ht_, comp_ev)
+                        else:
+                            buf2 = dpt_ext.empty_like(src2, dtype=buf2_dt)
+                            dep_evs = _manager.submitted_events
+                            (
+                                ht_copy_ev,
+                                copy_ev,
+                            ) = ti._copy_usm_ndarray_into_usm_ndarray(
+                                src=src2,
+                                dst=buf2,
+                                sycl_queue=exec_q,
+                                depends=dep_evs,
+                            )
+                            _manager.add_event_pair(ht_copy_ev, copy_ev)
+
+                            buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+                            ht_, bf_ev = self.binary_inplace_fn_(
+                                lhs=o1,
+                                rhs=buf2,
+                                sycl_queue=exec_q,
+                                depends=[copy_ev],
+                            )
+                            _manager.add_event_pair(ht_, bf_ev)
+
+                        return out
+
+            if isinstance(o2, dpt.usm_ndarray):
+                if (
+                    ti._array_overlap(o2, out)
+                    and not ti._same_logical_tensors(o2, out)
+                    and buf2_dt is None
+                ):
+                    # should not reach if out is reallocated
+                    # after being checked against o1
+                    out = dpt_ext.empty_like(out)
+
+        if isinstance(o1, dpt.usm_ndarray):
+            src1 = o1
+        else:
+            src1 = dpt_ext.asarray(o1, dtype=o1_dtype, sycl_queue=exec_q)
+        if isinstance(o2, dpt.usm_ndarray):
+            src2 = o2
+        else:
+            src2 = dpt_ext.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
+
+        if order == "A":
+            order = (
+                "F"
+                if all(
+                    arr.flags.f_contiguous
+                    for arr in (
+                        src1,
+                        src2,
+                    )
+                )
+                else "C"
+            )
+
+        if buf1_dt is None and buf2_dt is None:
+            if out is None:
+                if order == "K":
+                    out = _empty_like_pair_orderK(
+                        src1, src2, res_dt, res_shape, res_usm_type, exec_q
+                    )
+                else:
+                    out = dpt_ext.empty(
+                        res_shape,
+                        dtype=res_dt,
+                        usm_type=res_usm_type,
+                        sycl_queue=exec_q,
+                        order=order,
+                    )
+            if src1.shape != res_shape:
+                src1 = dpt_ext.broadcast_to(src1, res_shape)
+            if src2.shape != res_shape:
+                src2 = dpt_ext.broadcast_to(src2, res_shape)
+            deps_ev = _manager.submitted_events
+            ht_binary_ev, binary_ev = self.binary_fn_(
+                src1=src1,
+                src2=src2,
+                dst=out,
+                sycl_queue=exec_q,
+                depends=deps_ev,
+            )
+            _manager.add_event_pair(ht_binary_ev, binary_ev)
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out,
+                    dst=orig_out,
+                    sycl_queue=exec_q,
+                    depends=[binary_ev],
+                )
+                _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+                out = orig_out
+            return out
+        elif buf1_dt is None:
+            if order == "K":
+                buf2 = _empty_like_orderK(src2, buf2_dt)
+            else:
+                buf2 = dpt_ext.empty_like(src2, dtype=buf2_dt, order=order)
+            dep_evs = _manager.submitted_events
+            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_copy_ev, copy_ev)
+            if out is None:
+                if order == "K":
+                    out = _empty_like_pair_orderK(
+                        src1, buf2, res_dt, res_shape, res_usm_type, exec_q
+                    )
+                else:
+                    out = dpt_ext.empty(
+                        res_shape,
+                        dtype=res_dt,
+                        usm_type=res_usm_type,
+                        sycl_queue=exec_q,
+                        order=order,
+                    )
+
+            if src1.shape != res_shape:
+                src1 = dpt_ext.broadcast_to(src1, res_shape)
+            buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+            ht_binary_ev, binary_ev = self.binary_fn_(
+                src1=src1,
+                src2=buf2,
+                dst=out,
+                sycl_queue=exec_q,
+                depends=[copy_ev],
+            )
+            _manager.add_event_pair(ht_binary_ev, binary_ev)
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out,
+                    dst=orig_out,
+                    sycl_queue=exec_q,
+                    depends=[binary_ev],
+                )
+                _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+                out = orig_out
+            return out
+        elif buf2_dt is None:
+            if order == "K":
+                buf1 = _empty_like_orderK(src1, buf1_dt)
+            else:
+                buf1 = dpt_ext.empty_like(src1, dtype=buf1_dt, order=order)
+            dep_evs = _manager.submitted_events
+            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_copy_ev, copy_ev)
+            if out is None:
+                if order == "K":
+                    out = _empty_like_pair_orderK(
+                        buf1, src2, res_dt, res_shape, res_usm_type, exec_q
+                    )
+                else:
+                    out = dpt_ext.empty(
+                        res_shape,
+                        dtype=res_dt,
+                        usm_type=res_usm_type,
+                        sycl_queue=exec_q,
+                        order=order,
+                    )
+
+            buf1 = dpt_ext.broadcast_to(buf1, res_shape)
+            if src2.shape != res_shape:
+                src2 = dpt_ext.broadcast_to(src2, res_shape)
+            ht_binary_ev, binary_ev = self.binary_fn_(
+                src1=buf1,
+                src2=src2,
+                dst=out,
+                sycl_queue=exec_q,
+                depends=[copy_ev],
+            )
+            _manager.add_event_pair(ht_binary_ev, binary_ev)
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out,
+                    dst=orig_out,
+                    sycl_queue=exec_q,
+                    depends=[binary_ev],
+                )
+                _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+                out = orig_out
+            return out
+
+        if order == "K":
+            if src1.flags.c_contiguous and src2.flags.c_contiguous:
+                order = "C"
+            elif src1.flags.f_contiguous and src2.flags.f_contiguous:
+                order = "F"
+        if order == "K":
+            buf1 = _empty_like_orderK(src1, buf1_dt)
+        else:
+            buf1 = dpt_ext.empty_like(src1, dtype=buf1_dt, order=order)
+        dep_evs = _manager.submitted_events
+        ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy1_ev, copy1_ev)
+        if order == "K":
+            buf2 = _empty_like_orderK(src2, buf2_dt)
+        else:
+            buf2 = dpt_ext.empty_like(src2, dtype=buf2_dt, order=order)
+        ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy2_ev, copy2_ev)
+        if out is None:
+            if order == "K":
+                out = _empty_like_pair_orderK(
+                    buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
+                )
+            else:
+                out = dpt_ext.empty(
+                    res_shape,
+                    dtype=res_dt,
+                    usm_type=res_usm_type,
+                    sycl_queue=exec_q,
+                    order=order,
+                )
+
+        buf1 = dpt_ext.broadcast_to(buf1, res_shape)
+        buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+        ht_, bf_ev = self.binary_fn_(
+            src1=buf1,
+            src2=buf2,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy1_ev, copy2_ev],
+        )
+        _manager.add_event_pair(ht_, bf_ev)
+        return out
+
+    def _inplace_op(self, o1, o2):
+        if self.binary_inplace_fn_ is None:
+            raise ValueError(
+                "binary function does not have a dedicated in-place "
+                "implementation"
+            )
+        if not isinstance(o1, dpt.usm_ndarray):
+            raise TypeError(
+                "Expected first argument to be "
+                f"dpctl.tensor.usm_ndarray, got {type(o1)}"
+            )
+        if not o1.flags.writable:
+            raise ValueError("provided left-hand side array is read-only")
+        q1, o1_usm_type = o1.sycl_queue, o1.usm_type
+        q2, o2_usm_type = _get_queue_usm_type(o2)
+        if q2 is None:
+            exec_q = q1
+            res_usm_type = o1_usm_type
+        else:
+            exec_q = dpctl.utils.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    o1_usm_type,
+                    o2_usm_type,
+                )
+            )
+        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+        o1_shape = o1.shape
+        o2_shape = _get_shape(o2)
+        if not isinstance(o2_shape, (tuple, list)):
+            raise TypeError(
+                "Shape of second argument can not be inferred. "
+                "Expected list or tuple."
+            )
+        try:
+            res_shape = _broadcast_shape_impl(
+                [
+                    o1_shape,
+                    o2_shape,
+                ]
+            )
+        except ValueError:
+            raise ValueError(
+                "operands could not be broadcast together with shapes "
+                f"{o1_shape} and {o2_shape}"
+            )
+
+        if res_shape != o1_shape:
+            raise ValueError(
+                "The shape of the non-broadcastable left-hand "
+                f"side {o1_shape} is inconsistent with the "
+                f"broadcast shape {res_shape}."
+            )
+
+        sycl_dev = exec_q.sycl_device
+        o1_dtype = o1.dtype
+        o2_dtype = _get_dtype(o2, sycl_dev)
+        if not _validate_dtype(o2_dtype):
+            raise ValueError("Operand has an unsupported data type")
+
+        o1_dtype, o2_dtype = self.weak_type_resolver_(
+            o1_dtype, o2_dtype, sycl_dev
+        )
+
+        buf_dt, res_dt = _find_buf_dtype_in_place_op(
+            o1_dtype,
+            o2_dtype,
+            self.result_type_resolver_fn_,
+            sycl_dev,
+        )
+
+        if res_dt is None:
+            raise ValueError(
+                f"function '{self.name_}' does not support input types "
+                f"({o1_dtype}, {o2_dtype}), "
+                "and the inputs could not be safely coerced to any "
+                "supported types according to the casting rule "
+                "''same_kind''."
+            )
+
+        if res_dt != o1_dtype:
+            raise ValueError(
+                f"Output array of type {res_dt} is needed, " f"got {o1_dtype}"
+            )
+
+        _manager = SequentialOrderManager[exec_q]
+        if isinstance(o2, dpt.usm_ndarray):
+            src2 = o2
+            if (
+                ti._array_overlap(o2, o1)
+                and not ti._same_logical_tensors(o2, o1)
+                and buf_dt is None
+            ):
+                buf_dt = o2_dtype
+        else:
+            src2 = dpt_ext.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
+        if buf_dt is None:
+            if src2.shape != res_shape:
+                src2 = dpt_ext.broadcast_to(src2, res_shape)
+            dep_evs = _manager.submitted_events
+            ht_, comp_ev = self.binary_inplace_fn_(
+                lhs=o1,
+                rhs=src2,
+                sycl_queue=exec_q,
+                depends=dep_evs,
+            )
+            _manager.add_event_pair(ht_, comp_ev)
+        else:
+            buf = dpt_ext.empty_like(src2, dtype=buf_dt)
+            dep_evs = _manager.submitted_events
+            (
+                ht_copy_ev,
+                copy_ev,
+            ) = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=src2,
+                dst=buf,
+                sycl_queue=exec_q,
+                depends=dep_evs,
+            )
+            _manager.add_event_pair(ht_copy_ev, copy_ev)
+
+            buf = dpt_ext.broadcast_to(buf, res_shape)
+            ht_, bf_ev = self.binary_inplace_fn_(
+                lhs=o1,
+                rhs=buf,
+                sycl_queue=exec_q,
+                depends=[copy_ev],
+            )
+            _manager.add_event_pair(ht_, bf_ev)
+
+        return o1
diff --git a/dpctl_ext/tensor/_elementwise_funcs.py b/dpctl_ext/tensor/_elementwise_funcs.py
index ae0ef8aa3496..08d59d8289a3 100644
--- a/dpctl_ext/tensor/_elementwise_funcs.py
+++ b/dpctl_ext/tensor/_elementwise_funcs.py
@@ -30,7 +30,7 @@
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor._tensor_elementwise_impl as ti
 
-from ._elementwise_common import UnaryElementwiseFunc
+from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc
 from ._type_utils import (
     _acceptance_fn_negative,
     _acceptance_fn_reciprocal,
@@ -124,6 +124,41 @@
 )
 del _acosh_docstring
 
+# B01: ===== ADD   (x1, x2)
+
+_add_docstring_ = r"""
+add(x1, x2, /, \*, out=None, order='K')
+
+Calculates the sum for each element `x1_i` of the input array `x1` with
+the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise sums. The data type of the
+        returned array is determined by the Type Promotion Rules.
+"""
+add = BinaryElementwiseFunc(
+    "add",
+    ti._add_result_type,
+    ti._add,
+    _add_docstring_,
+    binary_inplace_fn=ti._add_inplace,
+)
+del _add_docstring_
+
 # U04: ===== ASIN  (x)
 _asin_docstring = r"""
 asin(x, /, \*, out=None, order='K')
@@ -211,6 +246,41 @@
 )
 del _atan_docstring
 
+# B02: ===== ATAN2 (x1, x2)
+_atan2_docstring_ = r"""
+atan2(x1, x2, /, \*, out=None, order='K')
+
+Calculates the inverse tangent of the quotient `x1_i/x2_i` for each element
+`x1_i` of the input array `x1` with the respective element `x2_i` of the
+input array `x2`. Each element-wise result is expressed in radians.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued floating-point
+        data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real-valued
+        floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the inverse tangent of the quotient `x1`/`x2`.
+        The returned array must have a real-valued floating-point data type
+        determined by Type Promotion Rules.
+"""
+
+atan2 = BinaryElementwiseFunc(
+    "atan2", ti._atan2_result_type, ti._atan2, _atan2_docstring_
+)
+del _atan2_docstring_
+
 # U07: ===== ATANH (x)
 _atanh_docstring = r"""
 atanh(x, /, \*, out=None, order='K')
@@ -240,6 +310,80 @@
 )
 del _atanh_docstring
 
+# B03: ===== BITWISE_AND           (x1, x2)
+_bitwise_and_docstring_ = r"""
+bitwise_and(x1, x2, /, \*, out=None, order='K')
+
+Computes the bitwise AND of the underlying binary representation of each
+element `x1_i` of the input array `x1` with the respective element `x2_i`
+of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have integer or boolean data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have integer or boolean data
+        type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+bitwise_and = BinaryElementwiseFunc(
+    "bitwise_and",
+    ti._bitwise_and_result_type,
+    ti._bitwise_and,
+    _bitwise_and_docstring_,
+    binary_inplace_fn=ti._bitwise_and_inplace,
+)
+del _bitwise_and_docstring_
+
+# B04: ===== BITWISE_LEFT_SHIFT    (x1, x2)
+_bitwise_left_shift_docstring_ = r"""
+bitwise_left_shift(x1, x2, /, \*, out=None, order='K')
+
+Shifts the bits of each element `x1_i` of the input array x1 to the left by
+appending `x2_i` (i.e., the respective element in the input array `x2`) zeros to
+the right of `x1_i`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have integer data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have integer data type.
+        Each element must be greater than or equal to 0.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+bitwise_left_shift = BinaryElementwiseFunc(
+    "bitwise_left_shift",
+    ti._bitwise_left_shift_result_type,
+    ti._bitwise_left_shift,
+    _bitwise_left_shift_docstring_,
+    binary_inplace_fn=ti._bitwise_left_shift_inplace,
+)
+del _bitwise_left_shift_docstring_
+
 # U08: ===== BITWISE_INVERT        (x)
 _bitwise_invert_docstring = r"""
 bitwise_invert(x, /, \*, out=None, order='K')
@@ -272,6 +416,117 @@
 )
 del _bitwise_invert_docstring
 
+# B05: ===== BITWISE_OR            (x1, x2)
+_bitwise_or_docstring_ = r"""
+bitwise_or(x1, x2, /, \*, out=None, order='K')
+
+Computes the bitwise OR of the underlying binary representation of each
+element `x1_i` of the input array `x1` with the respective element `x2_i`
+of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have integer or boolean data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have integer or boolean data
+        type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+bitwise_or = BinaryElementwiseFunc(
+    "bitwise_or",
+    ti._bitwise_or_result_type,
+    ti._bitwise_or,
+    _bitwise_or_docstring_,
+    binary_inplace_fn=ti._bitwise_or_inplace,
+)
+del _bitwise_or_docstring_
+
+# B06: ===== BITWISE_RIGHT_SHIFT   (x1, x2)
+_bitwise_right_shift_docstring_ = r"""
+bitwise_right_shift(x1, x2, /, \*, out=None, order='K')
+
+Shifts the bits of each element `x1_i` of the input array `x1` to the right
+according to the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have integer data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have integer data type.
+        Each element must be greater than or equal to 0.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+bitwise_right_shift = BinaryElementwiseFunc(
+    "bitwise_right_shift",
+    ti._bitwise_right_shift_result_type,
+    ti._bitwise_right_shift,
+    _bitwise_right_shift_docstring_,
+    binary_inplace_fn=ti._bitwise_right_shift_inplace,
+)
+del _bitwise_right_shift_docstring_
+
+
+# B07: ===== BITWISE_XOR           (x1, x2)
+_bitwise_xor_docstring_ = r"""
+bitwise_xor(x1, x2, /, \*, out=None, order='K')
+
+Computes the bitwise XOR of the underlying binary representation of each
+element `x1_i` of the input array `x1` with the respective element `x2_i`
+of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have integer or boolean data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have integer or boolean data
+        type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+bitwise_xor = BinaryElementwiseFunc(
+    "bitwise_xor",
+    ti._bitwise_xor_result_type,
+    ti._bitwise_xor,
+    _bitwise_xor_docstring_,
+    binary_inplace_fn=ti._bitwise_xor_inplace,
+)
+del _bitwise_xor_docstring_
+
 # U09: ==== CEIL          (x)
 _ceil_docstring = r"""
 ceil(x, /, \*, out=None, order='K')
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
new file mode 100644
index 000000000000..1b7440304f0e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -0,0 +1,688 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ADD(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::add
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct AddFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (tu_ns::is_complex<argT1>::value &&
+                      tu_ns::is_complex<argT2>::value)
+        {
+            using rT1 = typename argT1::value_type;
+            using rT2 = typename argT2::value_type;
+
+            return exprm_ns::complex<rT1>(in1) + exprm_ns::complex<rT2>(in2);
+        }
+        else if constexpr (tu_ns::is_complex<argT1>::value &&
+                           !tu_ns::is_complex<argT2>::value)
+        {
+            using rT1 = typename argT1::value_type;
+
+            return exprm_ns::complex<rT1>(in1) + in2;
+        }
+        else if constexpr (!tu_ns::is_complex<argT1>::value &&
+                           tu_ns::is_complex<argT2>::value)
+        {
+            using rT2 = typename argT2::value_type;
+
+            return in1 + exprm_ns::complex<rT2>(in2);
+        }
+        else {
+            return in1 + in2;
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = in1 + in2;
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using AddContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            AddFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using AddStridedFunctor =
+    elementwise_common::BinaryStridedFunctor<argT1,
+                                             argT2,
+                                             resT,
+                                             IndexerT,
+                                             AddFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct AddOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct AddContigHyperparameterSet
+{
+    using value_type = typename std::disjunction<
+        BinaryContigHyperparameterSetEntry<argTy1,
+                                           std::int32_t,
+                                           argTy2,
+                                           std::int32_t,
+                                           2u,
+                                           2u>,
+        BinaryContigHyperparameterSetEntry<argTy1,
+                                           std::int32_t,
+                                           argTy2,
+                                           std::int32_t,
+                                           2u,
+                                           2u>,
+        BinaryContigHyperparameterSetEntry<argTy1,
+                                           std::int64_t,
+                                           argTy2,
+                                           std::int64_t,
+                                           2u,
+                                           2u>,
+        BinaryContigHyperparameterSetEntry<argTy1,
+                                           std::uint64_t,
+                                           argTy2,
+                                           std::uint64_t,
+                                           2u,
+                                           2u>,
+        BinaryContigHyperparameterSetEntry<argTy1,
+                                           float,
+                                           argTy2,
+                                           float,
+                                           2u,
+                                           2u>,
+        BinaryContigHyperparameterSetEntry<argTy1,
+                                           double,
+                                           argTy2,
+                                           double,
+                                           1u,
+                                           2u>,
+        ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class add_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event add_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using AddHS = hyperparam_detail::AddContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr auto vec_sz = AddHS::vec_sz;
+    static constexpr auto n_vecs = AddHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct AddContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!AddOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = add_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct AddTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::add(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename AddOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class add_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event add_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg1_p,
+                             ssize_t arg1_offset,
+                             const char *arg2_p,
+                             ssize_t arg2_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, AddOutputType, AddStridedFunctor, add_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct AddStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!AddOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = add_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+class add_matrix_row_broadcast_sg_krn;
+
+template <typename argT1, typename argT2, typename resT>
+using AddContigMatrixContigRowBroadcastingFunctor =
+    elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        AddFunctor<argT1, argT2, resT>>;
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event add_contig_matrix_contig_row_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = mat[i,j] + vec[j]
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
+        argT1, argT2, resT, AddContigMatrixContigRowBroadcastingFunctor,
+        add_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p,
+                                         mat_offset, vec_p, vec_offset, res_p,
+                                         res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct AddContigMatrixContigRowBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!AddOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using resT = typename AddOutputType<T1, T2>::value_type;
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value ||
+                          dpctl::tensor::type_utils::is_complex<resT>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn =
+                    add_contig_matrix_contig_row_broadcast_impl<T1, T2, resT>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event add_contig_row_contig_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = mat[i,j] + vec[j]
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return add_contig_matrix_contig_row_broadcast_impl<argT2, argT1, resT>(
+        exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p,
+        res_offset, depends);
+};
+
+template <typename fnT, typename T1, typename T2>
+struct AddContigRowContigMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!AddOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using resT = typename AddOutputType<T1, T2>::value_type;
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value ||
+                          dpctl::tensor::type_utils::is_complex<resT>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn =
+                    add_contig_row_contig_matrix_broadcast_impl<T1, T2, resT>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct AddInplaceFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+
+    void operator()(resT &res, const argT &in)
+    {
+        res += in;
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in)
+    {
+        res += in;
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
+    argT,
+    resT,
+    AddInplaceFunctor<argT, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using AddInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        AddInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class add_inplace_contig_kernel;
+
+/* @brief Types supported by in-place add */
+template <typename argTy, typename resTy>
+struct AddInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, resTy, bool>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    resTy,
+                                    std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    resTy,
+                                    std::complex<double>>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct AddInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x += y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (AddInplaceTypePairSupport<argT, resT>::is_defined) {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event
+    add_inplace_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg_p,
+                            ssize_t arg_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    static constexpr auto vec_sz =
+        hyperparam_detail::AddContigHyperparameterSet<resTy, argTy>::vec_sz;
+    static constexpr auto n_vecs =
+        hyperparam_detail::AddContigHyperparameterSet<resTy, argTy>::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, AddInplaceContigFunctor, add_inplace_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset,
+                        depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct AddInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!AddInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = add_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class add_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event
+    add_inplace_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             ssize_t arg_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, AddInplaceStridedFunctor, add_inplace_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct AddInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!AddInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = add_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+class add_inplace_row_matrix_broadcast_sg_krn;
+
+template <typename argT, typename resT>
+using AddInplaceRowMatrixBroadcastingFunctor =
+    elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor<
+        argT,
+        resT,
+        AddInplaceFunctor<argT, resT>>;
+
+template <typename argT, typename resT>
+sycl::event add_inplace_row_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_inplace_row_matrix_broadcast_impl<
+        argT, resT, AddInplaceRowMatrixBroadcastingFunctor,
+        add_inplace_row_matrix_broadcast_sg_krn>(exec_q, host_tasks, n0, n1,
+                                                 vec_p, vec_offset, mat_p,
+                                                 mat_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct AddInplaceRowMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!AddInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn = add_inplace_row_matrix_broadcast_impl<T1, T2>;
+                return fn;
+            }
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::add
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
new file mode 100644
index 000000000000..220722d5b596
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
@@ -0,0 +1,233 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of ATAN2(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::kernels::atan2
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT1, typename argT2, typename resT>
+struct Atan2Functor
+{
+
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::false_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if (std::isinf(in2) && !sycl::signbit(in2)) {
+            if (std::isfinite(in1)) {
+                return sycl::copysign(resT(0), in1);
+            }
+        }
+        return sycl::atan2(in1, in2);
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using Atan2ContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            Atan2Functor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using Atan2StridedFunctor =
+    elementwise_common::BinaryStridedFunctor<argT1,
+                                             argT2,
+                                             resT,
+                                             IndexerT,
+                                             Atan2Functor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct Atan2OutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct Atan2ContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class atan2_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event atan2_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg1_p,
+                              ssize_t arg1_offset,
+                              const char *arg2_p,
+                              ssize_t arg2_offset,
+                              char *res_p,
+                              ssize_t res_offset,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using Atan2HS =
+        hyperparam_detail::Atan2ContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = Atan2HS::vec_sz;
+    static constexpr std::uint8_t n_vecs = Atan2HS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, Atan2OutputType, Atan2ContigFunctor,
+        atan2_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                             arg1_offset, arg2_p, arg2_offset,
+                                             res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct Atan2ContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!Atan2OutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = atan2_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct Atan2TypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::atan2(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename Atan2OutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class atan2_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    atan2_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg1_p,
+                       ssize_t arg1_offset,
+                       const char *arg2_p,
+                       ssize_t arg2_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, Atan2OutputType, Atan2StridedFunctor,
+        atan2_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                              arg1_offset, arg2_p, arg2_offset, res_p,
+                              res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct Atan2StridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!Atan2OutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = atan2_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::atan2
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
new file mode 100644
index 000000000000..d0b644c2f6bb
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
@@ -0,0 +1,461 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise bitwise_and(ar1, ar2) operation.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::bitwise_and
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT1, typename argT2, typename resT>
+struct BitwiseAndFunctor
+{
+    static_assert(std::is_same_v<resT, argT1>);
+    static_assert(std::is_same_v<resT, argT2>);
+
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (std::is_same_v<resT, bool>) {
+            return in1 && in2;
+        }
+        else {
+            return (in1 & in2);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            auto tmp = (in1 && in2);
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+        else {
+            return (in1 & in2);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using BitwiseAndContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    BitwiseAndFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using BitwiseAndStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    BitwiseAndFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct BitwiseAndOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct BitwiseAndContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class bitwise_and_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    bitwise_and_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using BitwiseAndHS =
+        hyperparam_detail::BitwiseAndContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz;
+    static constexpr std::uint8_t n_vec = BitwiseAndHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, BitwiseAndOutputType, BitwiseAndContigFunctor,
+        bitwise_and_contig_kernel, vec_sz, n_vec>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseAndContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseAndOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_and_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseAndTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename BitwiseAndOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class bitwise_and_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    bitwise_and_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg1_p,
+                             ssize_t arg1_offset,
+                             const char *arg2_p,
+                             ssize_t arg2_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, BitwiseAndOutputType, BitwiseAndStridedFunctor,
+        bitwise_and_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseAndStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseAndOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_and_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct BitwiseAndInplaceFunctor
+{
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    void operator()(resT &res, const argT &in) const
+    {
+        if constexpr (std::is_same_v<resT, bool>) {
+            res = res && in;
+        }
+        else {
+            res &= in;
+        }
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in) const
+    {
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            auto tmp = (res && in);
+            res = vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+        else {
+            res &= in;
+        }
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using BitwiseAndInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        BitwiseAndInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using BitwiseAndInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        BitwiseAndInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class bitwise_and_inplace_contig_kernel;
+
+/* @brief Types supported by in-place bitwise AND */
+template <typename argTy, typename resTy>
+struct BitwiseAndInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, resTy, bool>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct BitwiseAndInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x &= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (BitwiseAndInplaceTypePairSupport<argT, resT>::is_defined)
+        {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event bitwise_and_inplace_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    using BitwiseAndHS =
+        hyperparam_detail::BitwiseAndContigHyperparameterSet<resTy, argTy>;
+    static constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = BitwiseAndHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, BitwiseAndInplaceContigFunctor,
+        bitwise_and_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseAndInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseAndInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_and_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class bitwise_and_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event bitwise_and_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, BitwiseAndInplaceStridedFunctor,
+        bitwise_and_inplace_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseAndInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseAndInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_and_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::bitwise_and
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
new file mode 100644
index 000000000000..549a220fbabc
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
@@ -0,0 +1,485 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise bitwise_left_shift(ar1, ar2)
+/// operation.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::kernels::bitwise_left_shift
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT1, typename argT2, typename resT>
+struct BitwiseLeftShiftFunctor
+{
+    static_assert(std::is_integral_v<argT1>);
+    static_assert(std::is_integral_v<argT2>);
+    static_assert(!std::is_same_v<argT1, bool>);
+    static_assert(!std::is_same_v<argT2, bool>);
+
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        return impl(in1, in2);
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        sycl::vec<resT, vec_sz> res;
+#pragma unroll
+        for (int i = 0; i < vec_sz; ++i) {
+            res[i] = impl(in1[i], in2[i]);
+        }
+        return res;
+    }
+
+private:
+    resT impl(const argT1 &in1, const argT2 &in2) const
+    {
+        static constexpr argT2 in1_bitsize =
+            static_cast<argT2>(sizeof(argT1) * 8);
+        static constexpr resT zero = resT(0);
+
+        // bitshift op with second operand negative, or >= bitwidth(argT1) is UB
+        // array API spec mandates 0
+        if constexpr (std::is_unsigned_v<argT2>) {
+            return (in2 < in1_bitsize) ? (in1 << in2) : zero;
+        }
+        else {
+            return (in2 < argT2(0))
+                       ? zero
+                       : ((in2 < in1_bitsize) ? (in1 << in2) : zero);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using BitwiseLeftShiftContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    BitwiseLeftShiftFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using BitwiseLeftShiftStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    BitwiseLeftShiftFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct BitwiseLeftShiftOutputType
+{
+    using ResT = T1;
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct BitwiseLeftShiftContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class bitwise_left_shift_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    bitwise_left_shift_contig_impl(sycl::queue &exec_q,
+                                   std::size_t nelems,
+                                   const char *arg1_p,
+                                   ssize_t arg1_offset,
+                                   const char *arg2_p,
+                                   ssize_t arg2_offset,
+                                   char *res_p,
+                                   ssize_t res_offset,
+                                   const std::vector<sycl::event> &depends = {})
+{
+    using BitwiseLSHS =
+        hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet<argTy1,
+                                                                   argTy2>;
+    static constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, BitwiseLeftShiftOutputType,
+        BitwiseLeftShiftContigFunctor, bitwise_left_shift_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+                res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseLeftShiftContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseLeftShiftOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_left_shift_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseLeftShiftTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename BitwiseLeftShiftOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class bitwise_left_shift_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event bitwise_left_shift_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg1_p,
+    ssize_t arg1_offset,
+    const char *arg2_p,
+    ssize_t arg2_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, BitwiseLeftShiftOutputType,
+        BitwiseLeftShiftStridedFunctor, bitwise_left_shift_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseLeftShiftStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseLeftShiftOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_left_shift_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct BitwiseLeftShiftInplaceFunctor
+{
+    static_assert(std::is_integral_v<argT>);
+    static_assert(!std::is_same_v<argT, bool>);
+
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    void operator()(resT &res, const argT &in) const
+    {
+        impl(res, in);
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in) const
+    {
+#pragma unroll
+        for (int i = 0; i < vec_sz; ++i) {
+            impl(res[i], in[i]);
+        }
+    }
+
+private:
+    void impl(resT &res, const argT &in) const
+    {
+        static constexpr argT res_bitsize = static_cast<argT>(sizeof(resT) * 8);
+        static constexpr resT zero = resT(0);
+
+        // bitshift op with second operand negative, or >= bitwidth(argT1) is UB
+        // array API spec mandates 0
+        if constexpr (std::is_unsigned_v<argT>) {
+            (in < res_bitsize) ? (res <<= in) : res = zero;
+        }
+        else {
+            (in < argT(0)) ? res = zero
+                           : ((in < res_bitsize) ? (res <<= in) : res = zero);
+        }
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using BitwiseLeftShiftInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        BitwiseLeftShiftInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using BitwiseLeftShiftInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        BitwiseLeftShiftInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class bitwise_left_shift_inplace_contig_kernel;
+
+/* @brief Types supported by in-place bitwise left shift */
+template <typename argTy, typename resTy>
+struct BitwiseLeftShiftInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct BitwiseLeftShiftInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x <<= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (BitwiseLeftShiftInplaceTypePairSupport<argT,
+                                                             resT>::is_defined)
+        {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event bitwise_left_shift_inplace_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    using BitwiseLSHS =
+        hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet<resTy,
+                                                                   argTy>;
+    static constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, BitwiseLeftShiftInplaceContigFunctor,
+        bitwise_left_shift_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseLeftShiftInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseLeftShiftInplaceTypePairSupport<T1,
+                                                              T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_left_shift_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class bitwise_left_shift_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event bitwise_left_shift_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, BitwiseLeftShiftInplaceStridedFunctor,
+        bitwise_left_shift_inplace_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseLeftShiftInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseLeftShiftInplaceTypePairSupport<T1,
+                                                              T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_left_shift_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::bitwise_left_shift
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
new file mode 100644
index 000000000000..6714f238ffce
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
@@ -0,0 +1,461 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise bitwise_or(ar1, ar2) operation.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::bitwise_or
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT1, typename argT2, typename resT>
+struct BitwiseOrFunctor
+{
+    static_assert(std::is_same_v<resT, argT1>);
+    static_assert(std::is_same_v<resT, argT2>);
+
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (std::is_same_v<resT, bool>) {
+            return in1 || in2;
+        }
+        else {
+            return (in1 | in2);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            auto tmp = (in1 || in2);
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+        else {
+            return (in1 | in2);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using BitwiseOrContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    BitwiseOrFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using BitwiseOrStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    BitwiseOrFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct BitwiseOrOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct BitwiseOrContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class bitwise_or_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event bitwise_or_contig_impl(sycl::queue &exec_q,
+                                   std::size_t nelems,
+                                   const char *arg1_p,
+                                   ssize_t arg1_offset,
+                                   const char *arg2_p,
+                                   ssize_t arg2_offset,
+                                   char *res_p,
+                                   ssize_t res_offset,
+                                   const std::vector<sycl::event> &depends = {})
+{
+    using BitwiseOrHS =
+        hyperparam_detail::BitwiseOrContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, BitwiseOrOutputType, BitwiseOrContigFunctor,
+        bitwise_or_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseOrContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseOrOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_or_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseOrTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename BitwiseOrOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class bitwise_or_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    bitwise_or_strided_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            int nd,
+                            const ssize_t *shape_and_strides,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends,
+                            const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, BitwiseOrOutputType, BitwiseOrStridedFunctor,
+        bitwise_or_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseOrStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseOrOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_or_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct BitwiseOrInplaceFunctor
+{
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    void operator()(resT &res, const argT &in) const
+    {
+        if constexpr (std::is_same_v<resT, bool>) {
+            res = res || in;
+        }
+        else {
+            res |= in;
+        }
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in) const
+    {
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            auto tmp = (res || in);
+            res = vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+        else {
+            res |= in;
+        }
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using BitwiseOrInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        BitwiseOrInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using BitwiseOrInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        BitwiseOrInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class bitwise_or_inplace_contig_kernel;
+
+/* @brief Types supported by in-place bitwise OR */
+template <typename argTy, typename resTy>
+struct BitwiseOrInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, resTy, bool>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct BitwiseOrInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x |= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (BitwiseOrInplaceTypePairSupport<argT, resT>::is_defined) {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event
+    bitwise_or_inplace_contig_impl(sycl::queue &exec_q,
+                                   std::size_t nelems,
+                                   const char *arg_p,
+                                   ssize_t arg_offset,
+                                   char *res_p,
+                                   ssize_t res_offset,
+                                   const std::vector<sycl::event> &depends = {})
+{
+    using BitwiseOrHS =
+        hyperparam_detail::BitwiseOrContigHyperparameterSet<resTy, argTy>;
+
+    static constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, BitwiseOrInplaceContigFunctor,
+        bitwise_or_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseOrInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseOrInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_or_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class bitwise_or_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event bitwise_or_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, BitwiseOrInplaceStridedFunctor,
+        bitwise_or_inplace_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseOrInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseOrInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_or_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::bitwise_or
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
new file mode 100644
index 000000000000..49e05ac43f9a
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
@@ -0,0 +1,493 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise bitwise_right_shift(ar1, ar2)
+/// operation.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::kernels::bitwise_right_shift
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT1, typename argT2, typename resT>
+struct BitwiseRightShiftFunctor
+{
+    static_assert(std::is_same_v<resT, argT1>);
+    static_assert(std::is_integral_v<argT1>);
+    static_assert(std::is_integral_v<argT2>);
+
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        return impl(in1, in2);
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        sycl::vec<resT, vec_sz> res;
+#pragma unroll
+        for (int i = 0; i < vec_sz; ++i) {
+            res[i] = impl(in1[i], in2[i]);
+        }
+        return res;
+    }
+
+private:
+    resT impl(const argT1 &in1, const argT2 &in2) const
+    {
+        static constexpr argT2 in1_bitsize =
+            static_cast<argT2>(sizeof(argT1) * 8);
+        static constexpr resT zero = resT(0);
+
+        // bitshift op with second operand negative, or >= bitwidth(argT1) is UB
+        // array API spec mandates 0
+        if constexpr (std::is_unsigned_v<argT2>) {
+            return (in2 < in1_bitsize) ? (in1 >> in2) : zero;
+        }
+        else {
+            return (in2 < argT2(0))
+                       ? zero
+                       : ((in2 < in1_bitsize)
+                              ? (in1 >> in2)
+                              : (in1 < argT1(0) ? resT(-1) : zero));
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using BitwiseRightShiftContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    BitwiseRightShiftFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using BitwiseRightShiftStridedFunctor =
+    elementwise_common::BinaryStridedFunctor<
+        argT1,
+        argT2,
+        resT,
+        IndexerT,
+        BitwiseRightShiftFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct BitwiseRightShiftOutputType
+{
+    using ResT = T1;
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct BitwiseRightShiftContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class bitwise_right_shift_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event bitwise_right_shift_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    const char *arg1_p,
+    ssize_t arg1_offset,
+    const char *arg2_p,
+    ssize_t arg2_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    using BitwiseRSHS =
+        hyperparam_detail::BitwiseRightShiftContigHyperparameterSet<argTy1,
+                                                                    argTy2>;
+    constexpr std::uint8_t vec_sz = BitwiseRSHS::vec_sz;
+    constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, BitwiseRightShiftOutputType,
+        BitwiseRightShiftContigFunctor, bitwise_right_shift_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseRightShiftContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseRightShiftOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_right_shift_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseRightShiftTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename BitwiseRightShiftOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class bitwise_right_shift_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event bitwise_right_shift_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg1_p,
+    ssize_t arg1_offset,
+    const char *arg2_p,
+    ssize_t arg2_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, BitwiseRightShiftOutputType,
+        BitwiseRightShiftStridedFunctor, bitwise_right_shift_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseRightShiftStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseRightShiftOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_right_shift_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct BitwiseRightShiftInplaceFunctor
+{
+    static_assert(std::is_integral_v<argT>);
+    static_assert(!std::is_same_v<argT, bool>);
+
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    void operator()(resT &res, const argT &in) const
+    {
+        impl(res, in);
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in) const
+    {
+#pragma unroll
+        for (int i = 0; i < vec_sz; ++i) {
+            impl(res[i], in[i]);
+        }
+    }
+
+private:
+    void impl(resT &res, const argT &in) const
+    {
+        static constexpr argT res_bitsize = static_cast<argT>(sizeof(resT) * 8);
+        static constexpr resT zero = resT(0);
+
+        // bitshift op with second operand negative, or >= bitwidth(argT1) is UB
+        // array API spec mandates 0
+        if constexpr (std::is_unsigned_v<argT>) {
+            (in < res_bitsize) ? (res >>= in) : res = zero;
+        }
+        else {
+            (in < argT(0)) ? res = zero
+                           : ((in < res_bitsize) ? (res >>= in)
+                              : (res < resT(0))  ? res = resT(-1)
+                                                 : res = zero);
+        }
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using BitwiseRightShiftInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        BitwiseRightShiftInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using BitwiseRightShiftInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        BitwiseRightShiftInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class bitwise_right_shift_inplace_contig_kernel;
+
+/* @brief Types supported by in-place bitwise right shift */
+template <typename argTy, typename resTy>
+struct BitwiseRightShiftInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct BitwiseRightShiftInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x >>= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (BitwiseRightShiftInplaceTypePairSupport<argT,
+                                                              resT>::is_defined)
+        {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event bitwise_right_shift_inplace_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    using BitwiseRSHS =
+        hyperparam_detail::BitwiseRightShiftContigHyperparameterSet<resTy,
+                                                                    argTy>;
+
+    // res = OP(res, arg)
+    static constexpr std::uint8_t vec_sz = BitwiseRSHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, BitwiseRightShiftInplaceContigFunctor,
+        bitwise_right_shift_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseRightShiftInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseRightShiftInplaceTypePairSupport<T1,
+                                                               T2>::is_defined)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_right_shift_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class bitwise_right_shift_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event bitwise_right_shift_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, BitwiseRightShiftInplaceStridedFunctor,
+        bitwise_right_shift_inplace_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseRightShiftInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseRightShiftInplaceTypePairSupport<T1,
+                                                               T2>::is_defined)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_right_shift_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::bitwise_right_shift
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
new file mode 100644
index 000000000000..2238492d50d3
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
@@ -0,0 +1,465 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise bitwise_xor(ar1, ar2) operation.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::bitwise_xor
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT1, typename argT2, typename resT>
+struct BitwiseXorFunctor
+{
+    static_assert(std::is_same_v<resT, argT1>);
+    static_assert(std::is_same_v<resT, argT2>);
+
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (std::is_same_v<resT, bool>) {
+            // (false != false) -> false, (false != true) -> true
+            // (true != false) -> true,  (true != true) -> false
+            return (in1 != in2);
+        }
+        else {
+            return (in1 ^ in2);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            auto tmp = (in1 != in2);
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+        else {
+            return (in1 ^ in2);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using BitwiseXorContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    BitwiseXorFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using BitwiseXorStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    BitwiseXorFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct BitwiseXorOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct BitwiseXorContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class bitwise_xor_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    bitwise_xor_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using BitwiseXorHS =
+        hyperparam_detail::BitwiseXorContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, BitwiseXorOutputType, BitwiseXorContigFunctor,
+        bitwise_xor_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseXorContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseXorOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_xor_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseXorTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename BitwiseXorOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class bitwise_xor_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    bitwise_xor_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg1_p,
+                             ssize_t arg1_offset,
+                             const char *arg2_p,
+                             ssize_t arg2_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, BitwiseXorOutputType, BitwiseXorStridedFunctor,
+        bitwise_xor_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseXorStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseXorOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_xor_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct BitwiseXorInplaceFunctor
+{
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    void operator()(resT &res, const argT &in) const
+    {
+        if constexpr (std::is_same_v<resT, bool>) {
+            res = (res != in);
+        }
+        else {
+            res ^= in;
+        }
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in) const
+    {
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            auto tmp = (res != in);
+            res = vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+        else {
+            res ^= in;
+        }
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using BitwiseXorInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        BitwiseXorInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using BitwiseXorInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        BitwiseXorInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class bitwise_xor_inplace_contig_kernel;
+
+/* @brief Types supported by in-place bitwise XOR */
+template <typename argTy, typename resTy>
+struct BitwiseXorInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, resTy, bool>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct BitwiseXorInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x ^= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (BitwiseXorInplaceTypePairSupport<argT, resT>::is_defined)
+        {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event bitwise_xor_inplace_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    using BitwiseXorHS =
+        hyperparam_detail::BitwiseXorContigHyperparameterSet<resTy, argTy>;
+
+    static constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, BitwiseXorInplaceContigFunctor,
+        bitwise_xor_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseXorInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseXorInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_xor_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class bitwise_xor_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event bitwise_xor_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, BitwiseXorInplaceStridedFunctor,
+        bitwise_xor_inplace_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseXorInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!BitwiseXorInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_xor_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::bitwise_xor
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
new file mode 100644
index 000000000000..2c028bc30155
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -0,0 +1,478 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines common code for in-place elementwise tensor operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/sycl_utils.hpp"
+
+#include "kernels/alignment.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common_detail.hpp"
+
+namespace dpctl::tensor::kernels::elementwise_common
+{
+
+using dpctl::tensor::ssize_t;
+using dpctl::tensor::kernels::alignment_utils::
+    disabled_sg_loadstore_wrapper_krn;
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
+template <typename argT,
+          typename resT,
+          typename BinaryInplaceOperatorT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+struct BinaryInplaceContigFunctor
+{
+private:
+    const argT *rhs = nullptr;
+    resT *lhs = nullptr;
+    std::size_t nelems_;
+
+public:
+    BinaryInplaceContigFunctor(const argT *rhs_tp,
+                               resT *lhs_tp,
+                               const std::size_t n_elems)
+        : rhs(rhs_tp), lhs(lhs_tp), nelems_(n_elems)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        BinaryInplaceOperatorT op{};
+        static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs;
+        /* Each work-item processes vec_sz elements, contiguous in memory */
+        /* NB: Workgroup size must be divisible by sub-group size */
+
+        if constexpr (enable_sg_loadstore &&
+                      BinaryInplaceOperatorT::supports_sg_loadstore::value &&
+                      BinaryInplaceOperatorT::supports_vec::value &&
+                      (vec_sz > 1))
+        {
+            auto sg = ndit.get_sub_group();
+            std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            std::size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+
+            if (base + elems_per_wi * sgSize < nelems_) {
+
+#pragma unroll
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const std::size_t offset = base + it * sgSize;
+                    auto rhs_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&rhs[offset]);
+                    auto lhs_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&lhs[offset]);
+
+                    const sycl::vec<argT, vec_sz> &arg_vec =
+                        sub_group_load<vec_sz>(sg, rhs_multi_ptr);
+                    sycl::vec<resT, vec_sz> res_vec =
+                        sub_group_load<vec_sz>(sg, lhs_multi_ptr);
+                    op(res_vec, arg_vec);
+
+                    sub_group_store<vec_sz>(sg, res_vec, lhs_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
+                    op(lhs[k], rhs[k]);
+                }
+            }
+        }
+        else if constexpr (enable_sg_loadstore &&
+                           BinaryInplaceOperatorT::supports_sg_loadstore::value)
+        {
+            auto sg = ndit.get_sub_group();
+            std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            std::size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+
+            if (base + elems_per_wi * sgSize < nelems_) {
+#pragma unroll
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const std::size_t offset = base + it * sgSize;
+                    auto rhs_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&rhs[offset]);
+                    auto lhs_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&lhs[offset]);
+
+                    const sycl::vec<argT, vec_sz> arg_vec =
+                        sub_group_load<vec_sz>(sg, rhs_multi_ptr);
+                    sycl::vec<resT, vec_sz> res_vec =
+                        sub_group_load<vec_sz>(sg, lhs_multi_ptr);
+#pragma unroll
+                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
+                        op(res_vec[vec_id], arg_vec[vec_id]);
+                    }
+                    sub_group_store<vec_sz>(sg, res_vec, lhs_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
+                    op(lhs[k], rhs[k]);
+                }
+            }
+        }
+        else {
+            const std::size_t sgSize =
+                ndit.get_sub_group().get_local_range()[0];
+            const std::size_t gid = ndit.get_global_linear_id();
+            const std::size_t elems_per_sg = elems_per_wi * sgSize;
+
+            const std::size_t start =
+                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
+            const std::size_t end = std::min(nelems_, start + elems_per_sg);
+            for (std::size_t offset = start; offset < end; offset += sgSize) {
+                op(lhs[offset], rhs[offset]);
+            }
+        }
+    }
+};
+
+template <typename argT,
+          typename resT,
+          typename TwoOffsets_IndexerT,
+          typename BinaryInplaceOperatorT>
+struct BinaryInplaceStridedFunctor
+{
+private:
+    const argT *rhs = nullptr;
+    resT *lhs = nullptr;
+    TwoOffsets_IndexerT two_offsets_indexer_;
+
+public:
+    BinaryInplaceStridedFunctor(const argT *rhs_tp,
+                                resT *lhs_tp,
+                                const TwoOffsets_IndexerT &inp_res_indexer)
+        : rhs(rhs_tp), lhs(lhs_tp), two_offsets_indexer_(inp_res_indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wid) const
+    {
+        const auto &two_offsets_ =
+            two_offsets_indexer_(static_cast<ssize_t>(wid.get(0)));
+
+        const auto &inp_offset = two_offsets_.get_first_offset();
+        const auto &lhs_offset = two_offsets_.get_second_offset();
+
+        BinaryInplaceOperatorT op{};
+        op(lhs[lhs_offset], rhs[inp_offset]);
+    }
+};
+
+template <typename argT, typename resT, typename BinaryOperatorT>
+struct BinaryInplaceRowMatrixBroadcastingFunctor
+{
+private:
+    const argT *padded_vec;
+    resT *mat;
+    std::size_t n_elems;
+    std::size_t n1;
+
+public:
+    BinaryInplaceRowMatrixBroadcastingFunctor(const argT *row_tp,
+                                              resT *mat_tp,
+                                              std::size_t n_elems_in_mat,
+                                              std::size_t n_elems_in_row)
+        : padded_vec(row_tp), mat(mat_tp), n_elems(n_elems_in_mat),
+          n1(n_elems_in_row)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        /* Workgroup size is expected to be a multiple of sub-group size */
+        BinaryOperatorT op{};
+        static_assert(BinaryOperatorT::supports_sg_loadstore::value);
+
+        auto sg = ndit.get_sub_group();
+        const std::size_t gid = ndit.get_global_linear_id();
+
+        std::uint8_t sgSize = sg.get_max_local_range()[0];
+        std::size_t base = gid - sg.get_local_id()[0];
+
+        if (base + sgSize < n_elems) {
+            auto in_multi_ptr = sycl::address_space_cast<
+                sycl::access::address_space::global_space,
+                sycl::access::decorated::yes>(&padded_vec[base % n1]);
+
+            auto out_multi_ptr = sycl::address_space_cast<
+                sycl::access::address_space::global_space,
+                sycl::access::decorated::yes>(&mat[base]);
+
+            const argT vec_el = sub_group_load(sg, in_multi_ptr);
+            resT mat_el = sub_group_load(sg, out_multi_ptr);
+
+            op(mat_el, vec_el);
+
+            sub_group_store(sg, mat_el, out_multi_ptr);
+        }
+        else {
+            const std::size_t start = base + sg.get_local_id()[0];
+            for (std::size_t k = start; k < n_elems; k += sgSize) {
+                op(mat[k], padded_vec[k % n1]);
+            }
+        }
+    }
+};
+
+// Typedefs for function pointers
+
+typedef sycl::event (*binary_inplace_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+typedef sycl::event (*binary_inplace_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const ssize_t *,
+    const char *,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+typedef sycl::event (*binary_inplace_row_matrix_broadcast_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::vector<sycl::event> &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename argTy,
+          typename resTy,
+          template <typename T1,
+                    typename T2,
+                    std::uint8_t vs,
+                    std::uint8_t nv,
+                    bool enable_sg_loadstore>
+          class BinaryInplaceContigFunctorT,
+          template <typename T1, typename T2, std::uint8_t vs, std::uint8_t nv>
+          class kernel_name,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u>
+sycl::event
+    binary_inplace_contig_impl(sycl::queue &exec_q,
+                               std::size_t nelems,
+                               const char *rhs_p,
+                               ssize_t rhs_offset,
+                               char *lhs_p,
+                               ssize_t lhs_offset,
+                               const std::vector<sycl::event> &depends = {})
+{
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t lws = 128;
+        const std::size_t n_groups =
+            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+        const auto gws_range = sycl::range<1>(n_groups * lws);
+        const auto lws_range = sycl::range<1>(lws);
+
+        const argTy *arg_tp =
+            reinterpret_cast<const argTy *>(rhs_p) + rhs_offset;
+        resTy *res_tp = reinterpret_cast<resTy *>(lhs_p) + lhs_offset;
+
+        if (is_aligned<required_alignment>(arg_tp) &&
+            is_aligned<required_alignment>(res_tp))
+        {
+            static constexpr bool enable_sg_loadstore = true;
+            using KernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
+            using Impl =
+                BinaryInplaceContigFunctorT<argTy, resTy, vec_sz, n_vecs,
+                                            enable_sg_loadstore>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                Impl(arg_tp, res_tp, nelems));
+        }
+        else {
+            static constexpr bool disable_sg_loadstore = true;
+            using InnerKernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
+            using KernelName =
+                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+            using Impl =
+                BinaryInplaceContigFunctorT<argTy, resTy, vec_sz, n_vecs,
+                                            disable_sg_loadstore>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                Impl(arg_tp, res_tp, nelems));
+        }
+    });
+    return comp_ev;
+}
+
+template <typename argTy,
+          typename resTy,
+          template <typename T1, typename T2, typename IndT>
+          class BinaryInplaceStridedFunctorT,
+          template <typename T1, typename T2, typename IndT>
+          class kernel_name>
+sycl::event binary_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *rhs_p,
+    ssize_t rhs_offset,
+    char *lhs_p,
+    ssize_t lhs_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.depends_on(additional_depends);
+
+        using IndexerT =
+            typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+        const IndexerT indexer{nd, rhs_offset, lhs_offset, shape_and_strides};
+
+        const argTy *arg_tp = reinterpret_cast<const argTy *>(rhs_p);
+        resTy *res_tp = reinterpret_cast<resTy *>(lhs_p);
+
+        using Impl = BinaryInplaceStridedFunctorT<argTy, resTy, IndexerT>;
+
+        cgh.parallel_for<kernel_name<argTy, resTy, IndexerT>>(
+            {nelems}, Impl(arg_tp, res_tp, indexer));
+    });
+    return comp_ev;
+}
+
+template <typename argT,
+          typename resT,
+          template <typename T1, typename T3>
+          class BinaryInplaceRowMatrixBroadcastFunctorT,
+          template <typename T1, typename T3>
+          class kernel_name>
+sycl::event binary_inplace_row_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    const argT *vec = reinterpret_cast<const argT *>(vec_p) + vec_offset;
+    resT *mat = reinterpret_cast<resT *>(mat_p) + mat_offset;
+
+    const auto &dev = exec_q.get_device();
+    const auto &sg_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
+    // Get device-specific kernel info max_sub_group_size
+    std::size_t max_sgSize =
+        *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
+
+    std::size_t n1_padded = n1 + max_sgSize;
+    auto padded_vec_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<argT>(n1_padded,
+                                                              exec_q);
+    argT *padded_vec = padded_vec_owner.get();
+
+    sycl::event make_padded_vec_ev =
+        dpctl::tensor::kernels::elementwise_detail::populate_padded_vector<
+            argT>(exec_q, vec, n1, padded_vec, n1_padded, depends);
+
+    // sub-group spans work-items [I, I + sgSize)
+    // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
+    // Generically, sub_group_load( &mat[base]) may load arrays from
+    // different rows of mat. The start corresponds to row (base / n0)
+    // We read sub_group_load(&padded_vec[(base / n0)]). The vector is
+    // padded to ensure that reads are accessible
+
+    const std::size_t lws = 128;
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(make_padded_vec_ev);
+
+        auto lwsRange = sycl::range<1>(lws);
+        std::size_t n_elems = n0 * n1;
+        std::size_t n_groups = (n_elems + lws - 1) / lws;
+        auto gwsRange = sycl::range<1>(n_groups * lws);
+
+        using Impl = BinaryInplaceRowMatrixBroadcastFunctorT<argT, resT>;
+
+        cgh.parallel_for<class kernel_name<argT, resT>>(
+            sycl::nd_range<1>(gwsRange, lwsRange),
+            Impl(padded_vec, mat, n_elems, n1));
+    });
+
+    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {comp_ev}, padded_vec_owner);
+    host_tasks.push_back(tmp_cleanup_ev);
+
+    return comp_ev;
+}
+
+} // namespace dpctl::tensor::kernels::elementwise_common
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
index 697fad932755..b7f996bfa797 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
@@ -35,6 +35,7 @@
 #pragma once
 
 #include <algorithm>
+#include <cassert>
 #include <cstddef>
 #include <sycl/sycl.hpp>
 #include <type_traits>
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
index 8d2f1948754b..873c4dc89b44 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
@@ -35,11 +35,14 @@
 #pragma once
 
 #include <algorithm>
+#include <array
+#include <cassert>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <stdexcept>
 #include <sycl/sycl.hpp>
+#include <tuple>
 #include <type_traits>
 #include <utility>
 #include <vector>
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/add.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/add.cpp
new file mode 100644
index 000000000000..e37fad67e294
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/add.cpp
@@ -0,0 +1,243 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "add.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/add.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B01: ===== ADD (x1, x2)
+namespace impl
+{
+
+namespace add_fn_ns = dpctl::tensor::kernels::add;
+
+static binary_contig_impl_fn_ptr_t add_contig_dispatch_table[td_ns::num_types]
+                                                            [td_ns::num_types];
+
+static int add_output_id_table[td_ns::num_types][td_ns::num_types];
+static int add_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    add_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// add(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    add_contig_matrix_contig_row_broadcast_dispatch_table[td_ns::num_types]
+                                                         [td_ns::num_types];
+
+// add(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    add_contig_row_contig_matrix_broadcast_dispatch_table[td_ns::num_types]
+                                                         [td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    add_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_add_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = add_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::AddTypeMapFactory;
+    DispatchTableBuilder<int, AddTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(add_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::AddStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, AddStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(add_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::AddContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, AddContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(add_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::AddContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        AddContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        add_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::AddContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        AddContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        add_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::AddInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         AddInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(add_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::AddInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         AddInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(add_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::AddInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         AddInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::AddInplaceTypeMapFactory;
+    DispatchTableBuilder<int, AddInplaceTypeMapFactory, num_types> dtb9;
+    dtb9.populate_dispatch_table(add_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_add(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_add_dispatch_tables();
+        using impl::add_contig_dispatch_table;
+        using impl::add_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::add_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::add_output_id_table;
+        using impl::add_strided_dispatch_table;
+
+        auto add_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, add_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                add_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                add_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                add_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                add_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto add_result_type_pyapi = [&](const py::dtype &dtype1,
+                                         const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               add_output_id_table);
+        };
+        m.def("_add", add_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_add_result_type", add_result_type_pyapi, "");
+
+        using impl::add_inplace_contig_dispatch_table;
+        using impl::add_inplace_output_id_table;
+        using impl::add_inplace_row_matrix_dispatch_table;
+        using impl::add_inplace_strided_dispatch_table;
+
+        auto add_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, add_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                add_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                add_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                add_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/add.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/add.hpp
new file mode 100644
index 000000000000..0797adb79ddb
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/add.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_add(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan2.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan2.cpp
new file mode 100644
index 000000000000..60bb2e081fef
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan2.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "atan2.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/atan2.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B02: ===== ATAN2 (x1, x2)
+namespace impl
+{
+namespace atan2_fn_ns = dpctl::tensor::kernels::atan2;
+
+static binary_contig_impl_fn_ptr_t
+    atan2_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int atan2_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    atan2_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_atan2_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = atan2_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::Atan2TypeMapFactory;
+    DispatchTableBuilder<int, Atan2TypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(atan2_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::Atan2StridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, Atan2StridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(atan2_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::Atan2ContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, Atan2ContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(atan2_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_atan2(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_atan2_dispatch_tables();
+        using impl::atan2_contig_dispatch_table;
+        using impl::atan2_output_id_table;
+        using impl::atan2_strided_dispatch_table;
+
+        auto atan2_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                               const arrayT &dst, sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, atan2_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                atan2_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                atan2_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto atan2_result_type_pyapi = [&](const py::dtype &dtype1,
+                                           const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               atan2_output_id_table);
+        };
+        m.def("_atan2", atan2_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_atan2_result_type", atan2_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan2.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan2.hpp
new file mode 100644
index 000000000000..5bdf9b74db2e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan2.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_atan2(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp
new file mode 100644
index 000000000000..3976f480ff6d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp
@@ -0,0 +1,206 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "bitwise_and.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/bitwise_and.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B03: ===== BITWISE_AND (x1, x2)
+namespace impl
+{
+namespace bitwise_and_fn_ns = dpctl::tensor::kernels::bitwise_and;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int bitwise_and_output_id_table[td_ns::num_types][td_ns::num_types];
+static int bitwise_and_inplace_output_id_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_and_inplace_contig_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_and_inplace_strided_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_bitwise_and_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_and_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseAndTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseAndTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_and_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseAndStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseAndStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_and_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseAndContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseAndContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_and_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseAndInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseAndInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(bitwise_and_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseAndInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseAndInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(bitwise_and_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::BitwiseAndInplaceTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseAndInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(bitwise_and_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_bitwise_and(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_and_dispatch_tables();
+        using impl::bitwise_and_contig_dispatch_table;
+        using impl::bitwise_and_output_id_table;
+        using impl::bitwise_and_strided_dispatch_table;
+
+        auto bitwise_and_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, bitwise_and_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_and_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_and_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_and_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               bitwise_and_output_id_table);
+        };
+        m.def("_bitwise_and", bitwise_and_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_bitwise_and_result_type", bitwise_and_result_type_pyapi, "");
+
+        using impl::bitwise_and_inplace_contig_dispatch_table;
+        using impl::bitwise_and_inplace_output_id_table;
+        using impl::bitwise_and_inplace_strided_dispatch_table;
+
+        auto bitwise_and_inplace_pyapi = [&](const arrayT &src,
+                                             const arrayT &dst,
+                                             sycl::queue &exec_q,
+                                             const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, bitwise_and_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                bitwise_and_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                bitwise_and_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_bitwise_and_inplace", bitwise_and_inplace_pyapi, "",
+              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
new file mode 100644
index 000000000000..19f29ae8822e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_bitwise_and(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
new file mode 100644
index 000000000000..c26c9a42864f
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
@@ -0,0 +1,216 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "bitwise_left_shift.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/bitwise_left_shift.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B04: ===== BITWISE_LEFT_SHIFT (x1, x2)
+namespace impl
+{
+namespace bitwise_left_shift_fn_ns = dpctl::tensor::kernels::bitwise_left_shift;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_left_shift_contig_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+static int bitwise_left_shift_output_id_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static int bitwise_left_shift_inplace_output_id_table[td_ns::num_types]
+                                                     [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_left_shift_strided_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_left_shift_inplace_contig_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_left_shift_inplace_strided_dispatch_table[td_ns::num_types]
+                                                     [td_ns::num_types];
+
+void populate_bitwise_left_shift_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_left_shift_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseLeftShiftTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseLeftShiftTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_left_shift_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseLeftShiftStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         BitwiseLeftShiftStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_left_shift_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseLeftShiftContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+                         BitwiseLeftShiftContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_left_shift_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseLeftShiftInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseLeftShiftInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        bitwise_left_shift_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseLeftShiftInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseLeftShiftInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        bitwise_left_shift_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::BitwiseLeftShiftInplaceTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseLeftShiftInplaceTypeMapFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(bitwise_left_shift_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_bitwise_left_shift(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_left_shift_dispatch_tables();
+        using impl::bitwise_left_shift_contig_dispatch_table;
+        using impl::bitwise_left_shift_output_id_table;
+        using impl::bitwise_left_shift_strided_dispatch_table;
+
+        auto bitwise_left_shift_pyapi = [&](const arrayT &src1,
+                                            const arrayT &src2,
+                                            const arrayT &dst,
+                                            sycl::queue &exec_q,
+                                            const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends,
+                bitwise_left_shift_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_left_shift_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_left_shift_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_left_shift_result_type_pyapi =
+            [&](const py::dtype &dtype1, const py::dtype &dtype2) {
+                return py_binary_ufunc_result_type(
+                    dtype1, dtype2, bitwise_left_shift_output_id_table);
+            };
+        m.def("_bitwise_left_shift", bitwise_left_shift_pyapi, "",
+              py::arg("src1"), py::arg("src2"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+        m.def("_bitwise_left_shift_result_type",
+              bitwise_left_shift_result_type_pyapi, "");
+
+        using impl::bitwise_left_shift_inplace_contig_dispatch_table;
+        using impl::bitwise_left_shift_inplace_output_id_table;
+        using impl::bitwise_left_shift_inplace_strided_dispatch_table;
+
+        auto bitwise_left_shift_inplace_pyapi =
+            [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+                const event_vecT &depends = {}) {
+                return py_binary_inplace_ufunc(
+                    src, dst, exec_q, depends,
+                    bitwise_left_shift_inplace_output_id_table,
+                    // function pointers to handle inplace operation on
+                    // contiguous arrays (pointers may be nullptr)
+                    bitwise_left_shift_inplace_contig_dispatch_table,
+                    // function pointers to handle inplace operation on strided
+                    // arrays (most general case)
+                    bitwise_left_shift_inplace_strided_dispatch_table,
+                    // function pointers to handle inplace operation on
+                    // c-contig matrix with c-contig row with broadcasting
+                    // (may be nullptr)
+                    td_ns::NullPtrTable<
+                        binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+            };
+        m.def("_bitwise_left_shift_inplace", bitwise_left_shift_inplace_pyapi,
+              "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
new file mode 100644
index 000000000000..49a7947d98c3
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_bitwise_left_shift(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
new file mode 100644
index 000000000000..bbb138c406fb
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
@@ -0,0 +1,206 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "bitwise_or.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/bitwise_or.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B05: ===== BITWISE_OR (x1, x2)
+namespace impl
+{
+namespace bitwise_or_fn_ns = dpctl::tensor::kernels::bitwise_or;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int bitwise_or_output_id_table[td_ns::num_types][td_ns::num_types];
+static int bitwise_or_inplace_output_id_table[td_ns::num_types]
+                                             [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_or_inplace_contig_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_or_inplace_strided_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+
+void populate_bitwise_or_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_or_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseOrTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseOrTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_or_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseOrStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseOrStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_or_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseOrContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseOrContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_or_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseOrInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseOrInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(bitwise_or_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseOrInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseOrInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(bitwise_or_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::BitwiseOrInplaceTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseOrInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(bitwise_or_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_bitwise_or(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_or_dispatch_tables();
+        using impl::bitwise_or_contig_dispatch_table;
+        using impl::bitwise_or_output_id_table;
+        using impl::bitwise_or_strided_dispatch_table;
+
+        auto bitwise_or_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                    const arrayT &dst, sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, bitwise_or_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_or_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_or_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_or_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               bitwise_or_output_id_table);
+        };
+        m.def("_bitwise_or", bitwise_or_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_bitwise_or_result_type", bitwise_or_result_type_pyapi, "");
+
+        using impl::bitwise_or_inplace_contig_dispatch_table;
+        using impl::bitwise_or_inplace_output_id_table;
+        using impl::bitwise_or_inplace_strided_dispatch_table;
+
+        auto bitwise_or_inplace_pyapi = [&](const arrayT &src,
+                                            const arrayT &dst,
+                                            sycl::queue &exec_q,
+                                            const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, bitwise_or_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                bitwise_or_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                bitwise_or_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_bitwise_or_inplace", bitwise_or_inplace_pyapi, "",
+              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp
new file mode 100644
index 000000000000..1e24caa54429
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_bitwise_or(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
new file mode 100644
index 000000000000..099dd56b4484
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
@@ -0,0 +1,217 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "bitwise_right_shift.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/bitwise_right_shift.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B06: ===== BITWISE_RIGHT_SHIFT (x1, x2)
+namespace impl
+{
+namespace bitwise_right_shift_fn_ns =
+    dpctl::tensor::kernels::bitwise_right_shift;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_right_shift_contig_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+
+static int bitwise_right_shift_output_id_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static int bitwise_right_shift_inplace_output_id_table[td_ns::num_types]
+                                                      [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_right_shift_strided_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_right_shift_inplace_contig_dispatch_table[td_ns::num_types]
+                                                     [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_right_shift_inplace_strided_dispatch_table[td_ns::num_types]
+                                                      [td_ns::num_types];
+
+void populate_bitwise_right_shift_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_right_shift_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseRightShiftTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseRightShiftTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_right_shift_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseRightShiftStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         BitwiseRightShiftStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_right_shift_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseRightShiftContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+                         BitwiseRightShiftContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_right_shift_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseRightShiftInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseRightShiftInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        bitwise_right_shift_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseRightShiftInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseRightShiftInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        bitwise_right_shift_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::BitwiseRightShiftInplaceTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseRightShiftInplaceTypeMapFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(bitwise_right_shift_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_bitwise_right_shift(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_right_shift_dispatch_tables();
+        using impl::bitwise_right_shift_contig_dispatch_table;
+        using impl::bitwise_right_shift_output_id_table;
+        using impl::bitwise_right_shift_strided_dispatch_table;
+
+        auto bitwise_right_shift_pyapi = [&](const arrayT &src1,
+                                             const arrayT &src2,
+                                             const arrayT &dst,
+                                             sycl::queue &exec_q,
+                                             const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends,
+                bitwise_right_shift_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_right_shift_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_right_shift_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_right_shift_result_type_pyapi =
+            [&](const py::dtype &dtype1, const py::dtype &dtype2) {
+                return py_binary_ufunc_result_type(
+                    dtype1, dtype2, bitwise_right_shift_output_id_table);
+            };
+        m.def("_bitwise_right_shift", bitwise_right_shift_pyapi, "",
+              py::arg("src1"), py::arg("src2"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+        m.def("_bitwise_right_shift_result_type",
+              bitwise_right_shift_result_type_pyapi, "");
+
+        using impl::bitwise_right_shift_inplace_contig_dispatch_table;
+        using impl::bitwise_right_shift_inplace_output_id_table;
+        using impl::bitwise_right_shift_inplace_strided_dispatch_table;
+
+        auto bitwise_right_shift_inplace_pyapi =
+            [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+                const event_vecT &depends = {}) {
+                return py_binary_inplace_ufunc(
+                    src, dst, exec_q, depends,
+                    bitwise_right_shift_inplace_output_id_table,
+                    // function pointers to handle inplace operation on
+                    // contiguous arrays (pointers may be nullptr)
+                    bitwise_right_shift_inplace_contig_dispatch_table,
+                    // function pointers to handle inplace operation on strided
+                    // arrays (most general case)
+                    bitwise_right_shift_inplace_strided_dispatch_table,
+                    // function pointers to handle inplace operation on
+                    // c-contig matrix with c-contig row with broadcasting
+                    // (may be nullptr)
+                    td_ns::NullPtrTable<
+                        binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+            };
+        m.def("_bitwise_right_shift_inplace", bitwise_right_shift_inplace_pyapi,
+              "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp
new file mode 100644
index 000000000000..aeb24d73b2fc
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_bitwise_right_shift(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp
new file mode 100644
index 000000000000..9a23fec82e72
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp
@@ -0,0 +1,206 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "bitwise_xor.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/bitwise_xor.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B07: ===== BITWISE_XOR (x1, x2)
+namespace impl
+{
+namespace bitwise_xor_fn_ns = dpctl::tensor::kernels::bitwise_xor;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int bitwise_xor_output_id_table[td_ns::num_types][td_ns::num_types];
+static int bitwise_xor_inplace_output_id_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_xor_inplace_contig_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_xor_inplace_strided_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_bitwise_xor_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_xor_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseXorTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseXorTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_xor_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseXorStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseXorStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_xor_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseXorContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseXorContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_xor_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseXorInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseXorInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(bitwise_xor_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseXorInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseXorInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(bitwise_xor_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::BitwiseXorInplaceTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseXorInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(bitwise_xor_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_bitwise_xor(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_xor_dispatch_tables();
+        using impl::bitwise_xor_contig_dispatch_table;
+        using impl::bitwise_xor_output_id_table;
+        using impl::bitwise_xor_strided_dispatch_table;
+
+        auto bitwise_xor_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, bitwise_xor_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_xor_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_xor_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_xor_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               bitwise_xor_output_id_table);
+        };
+        m.def("_bitwise_xor", bitwise_xor_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_bitwise_xor_result_type", bitwise_xor_result_type_pyapi, "");
+
+        using impl::bitwise_xor_inplace_contig_dispatch_table;
+        using impl::bitwise_xor_inplace_output_id_table;
+        using impl::bitwise_xor_inplace_strided_dispatch_table;
+
+        auto bitwise_xor_inplace_pyapi = [&](const arrayT &src,
+                                             const arrayT &dst,
+                                             sycl::queue &exec_q,
+                                             const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, bitwise_xor_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                bitwise_xor_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                bitwise_xor_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_bitwise_xor_inplace", bitwise_xor_inplace_pyapi, "",
+              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp
new file mode 100644
index 000000000000..4029574cdd7d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_bitwise_xor(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
index 144e39be252f..e4e730a1da6b 100644
--- a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
@@ -38,19 +38,19 @@
 #include "abs.hpp"
 #include "acos.hpp"
 #include "acosh.hpp"
-// #include "add.hpp"
+#include "add.hpp"
 #include "angle.hpp"
 #include "asin.hpp"
 #include "asinh.hpp"
 #include "atan.hpp"
-// #include "atan2.hpp"
+#include "atan2.hpp"
 #include "atanh.hpp"
-// #include "bitwise_and.hpp"
+#include "bitwise_and.hpp"
 #include "bitwise_invert.hpp"
-// #include "bitwise_left_shift.hpp"
-// #include "bitwise_or.hpp"
-// #include "bitwise_right_shift.hpp"
-// #include "bitwise_xor.hpp"
+#include "bitwise_left_shift.hpp"
+#include "bitwise_or.hpp"
+#include "bitwise_right_shift.hpp"
+#include "bitwise_xor.hpp"
 #include "cbrt.hpp"
 #include "ceil.hpp"
 #include "conj.hpp"
@@ -118,19 +118,19 @@ void init_elementwise_functions(py::module_ m)
     init_abs(m);
     init_acos(m);
     init_acosh(m);
-    // init_add(m);
+    init_add(m);
     init_angle(m);
     init_asin(m);
     init_asinh(m);
     init_atan(m);
-    // init_atan2(m);
+    init_atan2(m);
     init_atanh(m);
-    // init_bitwise_and(m);
+    init_bitwise_and(m);
     init_bitwise_invert(m);
-    // init_bitwise_left_shift(m);
-    // init_bitwise_or(m);
-    // init_bitwise_right_shift(m);
-    // init_bitwise_xor(m);
+    init_bitwise_left_shift(m);
+    init_bitwise_or(m);
+    init_bitwise_right_shift(m);
+    init_bitwise_xor(m);
     init_cbrt(m);
     init_ceil(m);
     init_conj(m);
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
index cd56c5707264..b8450f8e7296 100644
--- a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
@@ -49,6 +49,7 @@
 #include <pybind11/pybind11.h>
 
 #include "elementwise_functions_type_utils.hpp"
+#include "kernels/alignment.hpp"
 #include "kernels/dpctl_tensor_types.hpp"
 #include "simplify_iteration_space.hpp"
 #include "utils/memory_overlap.hpp"
@@ -65,6 +66,9 @@ namespace dpctl::tensor::py_internal
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
 /*! @brief Template implementing Python API for unary elementwise functions */
 template <typename output_typesT,
           typename contig_dispatchT,
@@ -281,4 +285,532 @@ py::object py_unary_ufunc_result_type(const py::dtype &input_dtype,
     }
 }
 
+// ======================== Binary functions ===========================
+
+namespace
+{
+template <class Container, class T>
+bool isEqual(Container const &c, std::initializer_list<T> const &l)
+{
+    return std::equal(std::begin(c), std::end(c), std::begin(l), std::end(l));
+}
+} // namespace
+
+/*! @brief Template implementing Python API for binary elementwise
+ *         functions */
+template <typename output_typesT,
+          typename contig_dispatchT,
+          typename strided_dispatchT,
+          typename contig_matrix_row_dispatchT,
+          typename contig_row_matrix_dispatchT>
+std::pair<sycl::event, sycl::event> py_binary_ufunc(
+    const dpctl::tensor::usm_ndarray &src1,
+    const dpctl::tensor::usm_ndarray &src2,
+    const dpctl::tensor::usm_ndarray &dst, // dst = op(src1, src2), elementwise
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> depends,
+    //
+    const output_typesT &output_type_table,
+    const contig_dispatchT &contig_dispatch_table,
+    const strided_dispatchT &strided_dispatch_table,
+    const contig_matrix_row_dispatchT
+        &contig_matrix_row_broadcast_dispatch_table,
+    const contig_row_matrix_dispatchT
+        &contig_row_matrix_broadcast_dispatch_table)
+{
+    // check type_nums
+    int src1_typenum = src1.get_typenum();
+    int src2_typenum = src2.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int src1_typeid = array_types.typenum_to_lookup_id(src1_typenum);
+    int src2_typeid = array_types.typenum_to_lookup_id(src2_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    int output_typeid = output_type_table[src1_typeid][src2_typeid];
+
+    if (output_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array has unexpected elemental data type.");
+    }
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src1, src2, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // check shapes, broadcasting is assumed done by caller
+    // check that dimensions are the same
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != src1.get_ndim() || dst_nd != src2.get_ndim()) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    // check that shapes are the same
+    const py::ssize_t *src1_shape = src1.get_shape_raw();
+    const py::ssize_t *src2_shape = src2.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+
+    for (int i = 0; i < dst_nd; ++i) {
+        src_nelems *= static_cast<std::size_t>(src1_shape[i]);
+        shapes_equal = shapes_equal && (src1_shape[i] == dst_shape[i] &&
+                                        src2_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    // if nelems is zero, return
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(src1, dst) && !same_logical_tensors(src1, dst)) ||
+        (overlap(src2, dst) && !same_logical_tensors(src2, dst)))
+    {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+    // check memory overlap
+    const char *src1_data = src1.get_data();
+    const char *src2_data = src2.get_data();
+    char *dst_data = dst.get_data();
+
+    // handle contiguous inputs
+    bool is_src1_c_contig = src1.is_c_contiguous();
+    bool is_src1_f_contig = src1.is_f_contiguous();
+
+    bool is_src2_c_contig = src2.is_c_contiguous();
+    bool is_src2_f_contig = src2.is_f_contiguous();
+
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_dst_f_contig = dst.is_f_contiguous();
+
+    bool all_c_contig =
+        (is_src1_c_contig && is_src2_c_contig && is_dst_c_contig);
+    bool all_f_contig =
+        (is_src1_f_contig && is_src2_f_contig && is_dst_f_contig);
+
+    // dispatch for contiguous inputs
+    if (all_c_contig || all_f_contig) {
+        auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid];
+
+        if (contig_fn != nullptr) {
+            auto comp_ev = contig_fn(exec_q, src_nelems, src1_data, 0,
+                                     src2_data, 0, dst_data, 0, depends);
+            sycl::event ht_ev = dpctl::utils::keep_args_alive(
+                exec_q, {src1, src2, dst}, {comp_ev});
+
+            return std::make_pair(ht_ev, comp_ev);
+        }
+    }
+
+    // simplify strides
+    auto const &src1_strides = src1.get_strides_vector();
+    auto const &src2_strides = src2.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src1_strides;
+    shT simplified_src2_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src1_offset(0);
+    py::ssize_t src2_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = dst_nd;
+    const py::ssize_t *shape = src1_shape;
+
+    simplify_iteration_space_3(
+        nd, shape, src1_strides, src2_strides, dst_strides,
+        // outputs
+        simplified_shape, simplified_src1_strides, simplified_src2_strides,
+        simplified_dst_strides, src1_offset, src2_offset, dst_offset);
+
+    std::vector<sycl::event> host_tasks{};
+    if (nd < 3) {
+        static constexpr auto unit_stride =
+            std::initializer_list<py::ssize_t>{1};
+
+        if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) &&
+            isEqual(simplified_src2_strides, unit_stride) &&
+            isEqual(simplified_dst_strides, unit_stride))
+        {
+            auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid];
+
+            if (contig_fn != nullptr) {
+                auto comp_ev = contig_fn(exec_q, src_nelems, src1_data,
+                                         src1_offset, src2_data, src2_offset,
+                                         dst_data, dst_offset, depends);
+                sycl::event ht_ev = dpctl::utils::keep_args_alive(
+                    exec_q, {src1, src2, dst}, {comp_ev});
+
+                return std::make_pair(ht_ev, comp_ev);
+            }
+        }
+        if (nd == 2) {
+            static constexpr auto zero_one_strides =
+                std::initializer_list<py::ssize_t>{0, 1};
+            static constexpr auto one_zero_strides =
+                std::initializer_list<py::ssize_t>{1, 0};
+            static constexpr py::ssize_t one{1};
+            // special case of C-contiguous matrix and a row
+            if (isEqual(simplified_src2_strides, zero_one_strides) &&
+                isEqual(simplified_src1_strides, {simplified_shape[1], one}) &&
+                isEqual(simplified_dst_strides, {simplified_shape[1], one}))
+            {
+                auto matrix_row_broadcast_fn =
+                    contig_matrix_row_broadcast_dispatch_table[src1_typeid]
+                                                              [src2_typeid];
+                if (matrix_row_broadcast_fn != nullptr) {
+                    int src1_itemsize = src1.get_elemsize();
+                    int src2_itemsize = src2.get_elemsize();
+                    int dst_itemsize = dst.get_elemsize();
+
+                    if (is_aligned<required_alignment>(
+                            src1_data + src1_offset * src1_itemsize) &&
+                        is_aligned<required_alignment>(
+                            src2_data + src2_offset * src2_itemsize) &&
+                        is_aligned<required_alignment>(
+                            dst_data + dst_offset * dst_itemsize))
+                    {
+                        std::size_t n0 = simplified_shape[0];
+                        std::size_t n1 = simplified_shape[1];
+                        sycl::event comp_ev = matrix_row_broadcast_fn(
+                            exec_q, host_tasks, n0, n1, src1_data, src1_offset,
+                            src2_data, src2_offset, dst_data, dst_offset,
+                            depends);
+
+                        return std::make_pair(
+                            dpctl::utils::keep_args_alive(
+                                exec_q, {src1, src2, dst}, host_tasks),
+                            comp_ev);
+                    }
+                }
+            }
+            if (isEqual(simplified_src1_strides, one_zero_strides) &&
+                isEqual(simplified_src2_strides, {one, simplified_shape[0]}) &&
+                isEqual(simplified_dst_strides, {one, simplified_shape[0]}))
+            {
+                auto row_matrix_broadcast_fn =
+                    contig_row_matrix_broadcast_dispatch_table[src1_typeid]
+                                                              [src2_typeid];
+                if (row_matrix_broadcast_fn != nullptr) {
+
+                    int src1_itemsize = src1.get_elemsize();
+                    int src2_itemsize = src2.get_elemsize();
+                    int dst_itemsize = dst.get_elemsize();
+
+                    if (is_aligned<required_alignment>(
+                            src1_data + src1_offset * src1_itemsize) &&
+                        is_aligned<required_alignment>(
+                            src2_data + src2_offset * src2_itemsize) &&
+                        is_aligned<required_alignment>(
+                            dst_data + dst_offset * dst_itemsize))
+                    {
+                        std::size_t n0 = simplified_shape[1];
+                        std::size_t n1 = simplified_shape[0];
+                        sycl::event comp_ev = row_matrix_broadcast_fn(
+                            exec_q, host_tasks, n0, n1, src1_data, src1_offset,
+                            src2_data, src2_offset, dst_data, dst_offset,
+                            depends);
+
+                        return std::make_pair(
+                            dpctl::utils::keep_args_alive(
+                                exec_q, {src1, src2, dst}, host_tasks),
+                            comp_ev);
+                    }
+                }
+            }
+        }
+    }
+
+    // dispatch to strided code
+    auto strided_fn = strided_dispatch_table[src1_typeid][src2_typeid];
+
+    if (strided_fn == nullptr) {
+        throw std::runtime_error(
+            "Strided implementation is missing for src1_typeid=" +
+            std::to_string(src1_typeid) +
+            " and src2_typeid=" + std::to_string(src2_typeid));
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_tasks, simplified_shape, simplified_src1_strides,
+        simplified_src2_strides, simplified_dst_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_));
+    auto &copy_shape_ev = std::get<2>(ptr_sz_event_triple_);
+
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    sycl::event strided_fn_ev = strided_fn(
+        exec_q, src_nelems, nd, shape_strides, src1_data, src1_offset,
+        src2_data, src2_offset, dst_data, dst_offset, depends, {copy_shape_ev});
+
+    // async free of shape_strides temporary
+    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {strided_fn_ev}, shape_strides_owner);
+    host_tasks.push_back(tmp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, host_tasks),
+        strided_fn_ev);
+}
+
+/*! @brief Type querying for binary elementwise functions */
+template <typename output_typesT>
+py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype,
+                                       const py::dtype &input2_dtype,
+                                       const output_typesT &output_types_table)
+{
+    int tn1 = input1_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int tn2 = input2_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int src1_typeid = -1;
+    int src2_typeid = -1;
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    try {
+        src1_typeid = array_types.typenum_to_lookup_id(tn1);
+        src2_typeid = array_types.typenum_to_lookup_id(tn2);
+    } catch (const std::exception &e) {
+        throw py::value_error(e.what());
+    }
+
+    if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 ||
+        src2_typeid >= td_ns::num_types)
+    {
+        throw std::runtime_error("binary output type lookup failed");
+    }
+    int dst_typeid = output_types_table[src1_typeid][src2_typeid];
+
+    if (dst_typeid < 0) {
+        auto res = py::none();
+        return py::cast<py::object>(res);
+    }
+    else {
+        using type_utils::_dtype_from_typenum;
+
+        auto dst_typenum_t = static_cast<td_ns::typenum_t>(dst_typeid);
+        auto dt = _dtype_from_typenum(dst_typenum_t);
+
+        return py::cast<py::object>(dt);
+    }
+}
+
+// ==================== Inplace binary functions =======================
+
+template <typename output_typesT,
+          typename contig_dispatchT,
+          typename strided_dispatchT,
+          typename contig_row_matrix_dispatchT>
+std::pair<sycl::event, sycl::event>
+    py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs,
+                            const dpctl::tensor::usm_ndarray &rhs,
+                            sycl::queue &exec_q,
+                            const std::vector<sycl::event> depends,
+                            //
+                            const output_typesT &output_type_table,
+                            const contig_dispatchT &contig_dispatch_table,
+                            const strided_dispatchT &strided_dispatch_table,
+                            const contig_row_matrix_dispatchT
+                                &contig_row_matrix_broadcast_dispatch_table)
+{
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(lhs);
+
+    // check type_nums
+    int rhs_typenum = rhs.get_typenum();
+    int lhs_typenum = lhs.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum);
+    int lhs_typeid = array_types.typenum_to_lookup_id(lhs_typenum);
+
+    int output_typeid = output_type_table[rhs_typeid][lhs_typeid];
+
+    if (output_typeid != lhs_typeid) {
+        throw py::value_error(
+            "Left-hand side array has unexpected elemental data type.");
+    }
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(exec_q, {rhs, lhs})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    // check shapes, broadcasting is assumed done by caller
+    // check that dimensions are the same
+    int lhs_nd = lhs.get_ndim();
+    if (lhs_nd != rhs.get_ndim()) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    // check that shapes are the same
+    const py::ssize_t *rhs_shape = rhs.get_shape_raw();
+    const py::ssize_t *lhs_shape = lhs.get_shape_raw();
+    bool shapes_equal(true);
+    std::size_t rhs_nelems(1);
+
+    for (int i = 0; i < lhs_nd; ++i) {
+        rhs_nelems *= static_cast<std::size_t>(rhs_shape[i]);
+        shapes_equal = shapes_equal && (rhs_shape[i] == lhs_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    // if nelems is zero, return
+    if (rhs_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(lhs, rhs_nelems);
+
+    // check memory overlap
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(rhs, lhs) && !same_logical_tensors(rhs, lhs)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+    // check memory overlap
+    const char *rhs_data = rhs.get_data();
+    char *lhs_data = lhs.get_data();
+
+    // handle contiguous inputs
+    bool is_rhs_c_contig = rhs.is_c_contiguous();
+    bool is_rhs_f_contig = rhs.is_f_contiguous();
+
+    bool is_lhs_c_contig = lhs.is_c_contiguous();
+    bool is_lhs_f_contig = lhs.is_f_contiguous();
+
+    bool both_c_contig = (is_rhs_c_contig && is_lhs_c_contig);
+    bool both_f_contig = (is_rhs_f_contig && is_lhs_f_contig);
+
+    // dispatch for contiguous inputs
+    if (both_c_contig || both_f_contig) {
+        auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid];
+
+        if (contig_fn != nullptr) {
+            auto comp_ev = contig_fn(exec_q, rhs_nelems, rhs_data, 0, lhs_data,
+                                     0, depends);
+            sycl::event ht_ev =
+                dpctl::utils::keep_args_alive(exec_q, {rhs, lhs}, {comp_ev});
+
+            return std::make_pair(ht_ev, comp_ev);
+        }
+    }
+
+    // simplify strides
+    auto const &rhs_strides = rhs.get_strides_vector();
+    auto const &lhs_strides = lhs.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_rhs_strides;
+    shT simplified_lhs_strides;
+    py::ssize_t rhs_offset(0);
+    py::ssize_t lhs_offset(0);
+
+    int nd = lhs_nd;
+    const py::ssize_t *shape = rhs_shape;
+
+    simplify_iteration_space(nd, shape, rhs_strides, lhs_strides,
+                             // outputs
+                             simplified_shape, simplified_rhs_strides,
+                             simplified_lhs_strides, rhs_offset, lhs_offset);
+
+    std::vector<sycl::event> host_tasks{};
+    if (nd < 3) {
+        static constexpr auto unit_stride =
+            std::initializer_list<py::ssize_t>{1};
+
+        if ((nd == 1) && isEqual(simplified_rhs_strides, unit_stride) &&
+            isEqual(simplified_lhs_strides, unit_stride))
+        {
+            auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid];
+
+            if (contig_fn != nullptr) {
+                auto comp_ev =
+                    contig_fn(exec_q, rhs_nelems, rhs_data, rhs_offset,
+                              lhs_data, lhs_offset, depends);
+                sycl::event ht_ev = dpctl::utils::keep_args_alive(
+                    exec_q, {rhs, lhs}, {comp_ev});
+
+                return std::make_pair(ht_ev, comp_ev);
+            }
+        }
+        if (nd == 2) {
+            static constexpr auto one_zero_strides =
+                std::initializer_list<py::ssize_t>{1, 0};
+            static constexpr py::ssize_t one{1};
+            // special case of C-contiguous matrix and a row
+            if (isEqual(simplified_rhs_strides, one_zero_strides) &&
+                isEqual(simplified_lhs_strides, {one, simplified_shape[0]}))
+            {
+                auto row_matrix_broadcast_fn =
+                    contig_row_matrix_broadcast_dispatch_table[rhs_typeid]
+                                                              [lhs_typeid];
+                if (row_matrix_broadcast_fn != nullptr) {
+                    std::size_t n0 = simplified_shape[1];
+                    std::size_t n1 = simplified_shape[0];
+                    sycl::event comp_ev = row_matrix_broadcast_fn(
+                        exec_q, host_tasks, n0, n1, rhs_data, rhs_offset,
+                        lhs_data, lhs_offset, depends);
+
+                    return std::make_pair(dpctl::utils::keep_args_alive(
+                                              exec_q, {lhs, rhs}, host_tasks),
+                                          comp_ev);
+                }
+            }
+        }
+    }
+
+    // dispatch to strided code
+    auto strided_fn = strided_dispatch_table[rhs_typeid][lhs_typeid];
+
+    if (strided_fn == nullptr) {
+        throw std::runtime_error(
+            "Strided implementation is missing for rhs_typeid=" +
+            std::to_string(rhs_typeid) +
+            " and lhs_typeid=" + std::to_string(lhs_typeid));
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_tasks, simplified_shape, simplified_rhs_strides,
+        simplified_lhs_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_));
+    auto copy_shape_ev = std::get<2>(ptr_sz_event_triple_);
+
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    sycl::event strided_fn_ev =
+        strided_fn(exec_q, rhs_nelems, nd, shape_strides, rhs_data, rhs_offset,
+                   lhs_data, lhs_offset, depends, {copy_shape_ev});
+
+    // async free of shape_strides temporary
+    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {strided_fn_ev}, shape_strides_owner);
+
+    host_tasks.push_back(tmp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {rhs, lhs}, host_tasks),
+        strided_fn_ev);
+}
+
 } // namespace dpctl::tensor::py_internal
diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp
index e23f74a678dc..bd06ba1bd583 100644
--- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp
+++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp
@@ -30,6 +30,7 @@
 
 #include <cstddef>
 #include <exception>
+#include <iterator>
 #include <stdexcept>
 #include <utility>
 #include <vector>
diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp
index 87657d1fbecd..1a4fb69782dd 100644
--- a/dpnp/backend/include/dpnp4pybind11.hpp
+++ b/dpnp/backend/include/dpnp4pybind11.hpp
@@ -31,6 +31,7 @@
 #include "dpctl_capi.h"
 
 #include <array>
+#include <cassert>
 #include <complex>
 #include <cstddef> // for std::size_t for C++ linkage
 #include <cstdint>
diff --git a/dpnp/dpnp_iface_bitwise.py b/dpnp/dpnp_iface_bitwise.py
index edf68b2f6581..bff5c4e3aed9 100644
--- a/dpnp/dpnp_iface_bitwise.py
+++ b/dpnp/dpnp_iface_bitwise.py
@@ -43,12 +43,11 @@
 # pylint: disable=no-name-in-module
 # pylint: disable=protected-access
 
-import dpctl.tensor._tensor_elementwise_impl as ti
 import numpy
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor._tensor_elementwise_impl as ti_ext
+import dpctl_ext.tensor._tensor_elementwise_impl as ti
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
 from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc
 
@@ -517,8 +516,8 @@ def binary_repr(num, width=None):
 
 invert = DPNPUnaryFunc(
     "invert",
-    ti_ext._bitwise_invert_result_type,
-    ti_ext._bitwise_invert,
+    ti._bitwise_invert_result_type,
+    ti._bitwise_invert,
     _INVERT_DOCSTRING,
 )
 
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index c84b61dad4bf..d1bdbdcfc961 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -469,8 +469,8 @@ def _validate_interp_param(param, name, exec_q, usm_type, dtype=None):
 
 add = DPNPBinaryFunc(
     "add",
-    ti._add_result_type,
-    ti._add,
+    ti_ext._add_result_type,
+    ti_ext._add,
     _ADD_DOCSTRING,
     mkl_fn_to_call="_mkl_add_to_call",
     mkl_impl_fn="_add",
diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py
index 6deab3a8876c..186ae47b0958 100644
--- a/dpnp/dpnp_iface_trigonometric.py
+++ b/dpnp/dpnp_iface_trigonometric.py
@@ -572,8 +572,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 atan2 = DPNPBinaryFunc(
     "atan2",
-    ti._atan2_result_type,
-    ti._atan2,
+    ti_ext._atan2_result_type,
+    ti_ext._atan2,
     _ATAN2_DOCSTRING,
     mkl_fn_to_call="_mkl_atan2_to_call",
     mkl_impl_fn="_atan2",

From 1bfa0116d2598b2c02e136ab4793efb5484f7aa3 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 18 Mar 2026 18:32:43 +0100
Subject: [PATCH 18/43] Extend `_tensor_elementwise_impl` with binary functions
 part 2 (#2804)

This PR extends `_tensor_elementwise_impl` with part of binary functions
: `divide, equal, floor_divide, greater, greater_equal, hypot, less,
less_equal, logaddexp`
---
 dpctl_ext/tensor/CMakeLists.txt               |  18 +-
 dpctl_ext/tensor/__init__.py                  |  18 +
 dpctl_ext/tensor/_elementwise_funcs.py        | 323 +++++++++
 .../kernels/elementwise_functions/equal.hpp   | 318 +++++++++
 .../elementwise_functions/floor_divide.hpp    | 552 +++++++++++++++
 .../kernels/elementwise_functions/greater.hpp | 319 +++++++++
 .../elementwise_functions/greater_equal.hpp   | 319 +++++++++
 .../kernels/elementwise_functions/hypot.hpp   | 249 +++++++
 .../kernels/elementwise_functions/less.hpp    | 316 +++++++++
 .../elementwise_functions/less_equal.hpp      | 318 +++++++++
 .../elementwise_functions/logaddexp.hpp       |   5 -
 .../elementwise_functions/true_divide.hpp     | 668 ++++++++++++++++++
 .../include/kernels/linalg_functions/gemm.hpp |   2 +-
 .../elementwise_common.cpp                    |  37 +-
 .../source/elementwise_functions/equal.cpp    | 145 ++++
 .../source/elementwise_functions/equal.hpp    |  46 ++
 .../elementwise_functions/floor_divide.cpp    | 205 ++++++
 .../elementwise_functions/floor_divide.hpp    |  46 ++
 .../source/elementwise_functions/greater.cpp  | 145 ++++
 .../source/elementwise_functions/greater.hpp  |  46 ++
 .../elementwise_functions/greater_equal.cpp   | 146 ++++
 .../elementwise_functions/greater_equal.hpp   |  46 ++
 .../source/elementwise_functions/hypot.cpp    | 145 ++++
 .../source/elementwise_functions/hypot.hpp    |  46 ++
 .../source/elementwise_functions/less.cpp     | 145 ++++
 .../source/elementwise_functions/less.hpp     |  46 ++
 .../elementwise_functions/less_equal.cpp      | 145 ++++
 .../elementwise_functions/less_equal.hpp      |  46 ++
 .../elementwise_functions/logaddexp.cpp       | 145 ++++
 .../elementwise_functions/logaddexp.hpp       |  46 ++
 .../elementwise_functions/true_divide.cpp     | 500 +++++++++++++
 .../elementwise_functions/true_divide.hpp     |  46 ++
 dpnp/dpnp_iface_logic.py                      |  20 +-
 dpnp/dpnp_iface_mathematical.py               |  12 +-
 dpnp/dpnp_iface_trigonometric.py              | 104 ++-
 35 files changed, 5630 insertions(+), 103 deletions(-)
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/equal.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/equal.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/greater.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/greater.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/hypot.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/hypot.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/less.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/less.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/less_equal.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/less_equal.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/logaddexp.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/true_divide.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/true_divide.hpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index b032dc34bdb3..7e1170f4ebff 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -94,26 +94,26 @@ set(_elementwise_sources
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp2.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/expm1.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor_divide.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor_divide.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater_equal.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/hypot.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater_equal.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/hypot.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/imag.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isfinite.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isinf.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isnan.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less_equal.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less_equal.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log1p.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp
@@ -141,7 +141,7 @@ set(_elementwise_sources
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/trunc.cpp
 )
 set(_reduction_sources
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index 5172d426334a..279e3a95fd03 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -75,18 +75,27 @@
     conj,
     cos,
     cosh,
+    divide,
+    equal,
     exp,
     exp2,
     expm1,
     floor,
+    floor_divide,
+    greater,
+    greater_equal,
+    hypot,
     imag,
     isfinite,
     isinf,
     isnan,
+    less,
+    less_equal,
     log,
     log1p,
     log2,
     log10,
+    logaddexp,
     logical_not,
     negative,
     positive,
@@ -203,8 +212,10 @@
     "cumulative_prod",
     "cumulative_sum",
     "diff",
+    "divide",
     "empty",
     "empty_like",
+    "equal",
     "extract",
     "expand_dims",
     "eye",
@@ -214,9 +225,13 @@
     "finfo",
     "flip",
     "floor",
+    "floor_divide",
     "from_numpy",
     "full",
     "full_like",
+    "greater",
+    "greater_equal",
+    "hypot",
     "iinfo",
     "imag",
     "isfinite",
@@ -224,8 +239,11 @@
     "isdtype",
     "isin",
     "isnan",
+    "less",
+    "less_equal",
     "linspace",
     "log",
+    "logaddexp",
     "logical_not",
     "logsumexp",
     "log1p",
diff --git a/dpctl_ext/tensor/_elementwise_funcs.py b/dpctl_ext/tensor/_elementwise_funcs.py
index 08d59d8289a3..17bdf94d9be5 100644
--- a/dpctl_ext/tensor/_elementwise_funcs.py
+++ b/dpctl_ext/tensor/_elementwise_funcs.py
@@ -32,8 +32,10 @@
 
 from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc
 from ._type_utils import (
+    _acceptance_fn_divide,
     _acceptance_fn_negative,
     _acceptance_fn_reciprocal,
+    _resolve_weak_types_all_py_ints,
 )
 
 # U01: ==== ABS    (x)
@@ -637,6 +639,78 @@
 )
 del _cosh_docstring
 
+# B08: ==== DIVIDE        (x1, x2)
+_divide_docstring_ = r"""
+divide(x1, x2, /, \*, out=None, order='K')
+
+Calculates the ratio for each element `x1_i` of the input array `x1` with
+the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a floating-point data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise division. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+divide = BinaryElementwiseFunc(
+    "divide",
+    ti._divide_result_type,
+    ti._divide,
+    _divide_docstring_,
+    binary_inplace_fn=ti._divide_inplace,
+    acceptance_fn=_acceptance_fn_divide,
+    weak_type_resolver=_resolve_weak_types_all_py_ints,
+)
+del _divide_docstring_
+
+# B09: ==== EQUAL         (x1, x2)
+_equal_docstring_ = r"""
+equal(x1, x2, /, \*, out=None, order='K')
+
+Calculates equality test results for each element `x1_i` of the input array `x1`
+with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise equality comparison.
+        The returned array has a data type of `bool`.
+"""
+
+equal = BinaryElementwiseFunc(
+    "equal",
+    ti._equal_result_type,
+    ti._equal,
+    _equal_docstring_,
+    weak_type_resolver=_resolve_weak_types_all_py_ints,
+)
+del _equal_docstring_
+
 # U13: ==== EXP           (x)
 _exp_docstring = r"""
 exp(x, /, \*, out=None, order='K')
@@ -664,6 +738,114 @@
 exp = UnaryElementwiseFunc("exp", ti._exp_result_type, ti._exp, _exp_docstring)
 del _exp_docstring
 
+# B10: ==== FLOOR_DIVIDE  (x1, x2)
+_floor_divide_docstring_ = r"""
+floor_divide(x1, x2, /, \*, out=None, order='K')
+
+Calculates the ratio for each element `x1_i` of the input array `x1` with
+the respective element `x2_i` of the input array `x2` to the greatest
+integer-value number that is not greater than the division result.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real-valued data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise floor of division.
+        The data type of the returned array is determined by the Type
+        Promotion Rules.
+"""
+
+floor_divide = BinaryElementwiseFunc(
+    "floor_divide",
+    ti._floor_divide_result_type,
+    ti._floor_divide,
+    _floor_divide_docstring_,
+    binary_inplace_fn=ti._floor_divide_inplace,
+)
+del _floor_divide_docstring_
+
+# B11: ==== GREATER       (x1, x2)
+_greater_docstring_ = r"""
+greater(x1, x2, /, \*, out=None, order='K')
+
+Computes the greater-than test results for each element `x1_i` of
+the input array `x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise greater-than comparison.
+        The returned array has a data type of `bool`.
+"""
+
+greater = BinaryElementwiseFunc(
+    "greater",
+    ti._greater_result_type,
+    ti._greater,
+    _greater_docstring_,
+    weak_type_resolver=_resolve_weak_types_all_py_ints,
+)
+del _greater_docstring_
+
+# B12: ==== GREATER_EQUAL (x1, x2)
+_greater_equal_docstring_ = r"""
+greater_equal(x1, x2, /, \*, out=None, order='K')
+
+Computes the greater-than or equal-to test results for each element `x1_i` of
+the input array `x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise greater-than or equal-to
+        comparison.
+        The returned array has a data type of `bool`.
+"""
+
+greater_equal = BinaryElementwiseFunc(
+    "greater_equal",
+    ti._greater_equal_result_type,
+    ti._greater_equal,
+    _greater_equal_docstring_,
+    weak_type_resolver=_resolve_weak_types_all_py_ints,
+)
+del _greater_equal_docstring_
+
 # U14: ==== EXPM1         (x)
 _expm1_docstring = r"""
 expm1(x, /, \*, out=None, order='K')
@@ -839,6 +1021,77 @@
 )
 del _isnan_docstring_
 
+# B13: ==== LESS        (x1, x2)
+_less_docstring_ = r"""
+less(x1, x2, /, \*, out=None, order='K')
+
+Computes the less-than test results for each element `x1_i` of
+the input array `x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise less-than comparison.
+        The returned array has a data type of `bool`.
+"""
+
+less = BinaryElementwiseFunc(
+    "less",
+    ti._less_result_type,
+    ti._less,
+    _less_docstring_,
+    weak_type_resolver=_resolve_weak_types_all_py_ints,
+)
+del _less_docstring_
+
+
+# B14: ==== LESS_EQUAL  (x1, x2)
+_less_equal_docstring_ = r"""
+less_equal(x1, x2, /, \*, out=None, order='K')
+
+Computes the less-than or equal-to test results for each element `x1_i` of
+the input array `x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise less-than or equal-to
+        comparison. The returned array has a data type of `bool`.
+"""
+
+less_equal = BinaryElementwiseFunc(
+    "less_equal",
+    ti._less_equal_result_type,
+    ti._less_equal,
+    _less_equal_docstring_,
+    weak_type_resolver=_resolve_weak_types_all_py_ints,
+)
+del _less_equal_docstring_
+
 # U20: ==== LOG         (x)
 _log_docstring = r"""
 log(x, /, \*, out=None, order='K')
@@ -953,6 +1206,43 @@
 )
 del _log10_docstring_
 
+# B15: ==== LOGADDEXP   (x1, x2)
+_logaddexp_docstring_ = r"""
+logaddexp(x1, x2, /, \*, out=None, order='K')
+
+Calculates the natural logarithm of the sum of exponentials for each element
+`x1_i` of the input array `x1` with the respective element `x2_i` of the input
+array `x2`.
+
+This function calculates `log(exp(x1) + exp(x2))` more accurately for small
+values of `x`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued floating-point data
+        type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real-valued floating-point
+        data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+logaddexp = BinaryElementwiseFunc(
+    "logaddexp", ti._logaddexp_result_type, ti._logaddexp, _logaddexp_docstring_
+)
+del _logaddexp_docstring_
+
 # U24: ==== LOGICAL_NOT (x)
 _logical_not_docstring = r"""
 logical_not(x, /, \*, out=None, order='K')
@@ -1329,6 +1619,39 @@
 )
 del _trunc_docstring
 
+# B24: ==== HYPOT        (x1, x2)
+_hypot_docstring_ = r"""
+hypot(x1, x2, /, \*, out=None, order='K')
+
+Computes the square root of the sum of squares for each element `x1_i` of the
+input array `x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued floating-point data
+        type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real-valued floating-point
+        data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise hypotenuse. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+hypot = BinaryElementwiseFunc(
+    "hypot", ti._hypot_result_type, ti._hypot, _hypot_docstring_
+)
+del _hypot_docstring_
+
 # U37: ==== CBRT        (x)
 _cbrt_docstring_ = r"""
 cbrt(x, /, \*, out=None, order='K')
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
new file mode 100644
index 000000000000..3a838e919369
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
@@ -0,0 +1,318 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of equality of
+/// tensor elements.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::equal
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct EqualFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (tu_ns::is_complex<argT1>::value &&
+                      tu_ns::is_complex<argT2>::value)
+        {
+            using realT1 = typename argT1::value_type;
+            using realT2 = typename argT2::value_type;
+
+            return exprm_ns::complex<realT1>(in1) ==
+                   exprm_ns::complex<realT2>(in2);
+        }
+        else {
+            if constexpr (std::is_integral_v<argT1> &&
+                          std::is_integral_v<argT2> &&
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
+            {
+                if constexpr (std::is_signed_v<argT1> &&
+                              !std::is_signed_v<argT2>) {
+                    return (in1 < 0) ? false : (static_cast<argT2>(in1) == in2);
+                }
+                else {
+                    if constexpr (!std::is_signed_v<argT1> &&
+                                  std::is_signed_v<argT2>) {
+                        return (in2 < 0) ? false
+                                         : (in1 == static_cast<argT1>(in2));
+                    }
+                }
+            }
+            else {
+                return (in1 == in2);
+            }
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = (in1 == in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using EqualContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            EqualFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using EqualStridedFunctor =
+    elementwise_common::BinaryStridedFunctor<argT1,
+                                             argT2,
+                                             resT,
+                                             IndexerT,
+                                             EqualFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct EqualOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct EqualContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class equal_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event equal_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg1_p,
+                              ssize_t arg1_offset,
+                              const char *arg2_p,
+                              ssize_t arg2_offset,
+                              char *res_p,
+                              ssize_t res_offset,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using EqualHS =
+        hyperparam_detail::EqualContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = EqualHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = EqualHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, EqualOutputType, EqualContigFunctor,
+        equal_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                             arg1_offset, arg2_p, arg2_offset,
+                                             res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct EqualContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!EqualOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = equal_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct EqualTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()==(x, y), always bool */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename EqualOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class equal_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    equal_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg1_p,
+                       ssize_t arg1_offset,
+                       const char *arg2_p,
+                       ssize_t arg2_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, EqualOutputType, EqualStridedFunctor,
+        equal_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                              arg1_offset, arg2_p, arg2_offset, res_p,
+                              res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct EqualStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!EqualOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = equal_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::equal
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
new file mode 100644
index 000000000000..19ee9d268770
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -0,0 +1,552 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of FLOOR_DIVIDE(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::floor_divide
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct FloorDivideFunctor
+{
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
+            if (in2 == argT2(0)) {
+                return resT(0);
+            }
+            if constexpr (std::is_signed_v<argT1> || std::is_signed_v<argT2>) {
+                auto div = in1 / in2;
+                auto mod = in1 % in2;
+                auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0));
+                return (div - corr);
+            }
+            else {
+                return (in1 / in2);
+            }
+        }
+        else {
+            auto div = in1 / in2;
+            return (div == resT(0)) ? div : resT(sycl::floor(div));
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        if constexpr (std::is_integral_v<resT>) {
+            sycl::vec<resT, vec_sz> res;
+#pragma unroll
+            for (int i = 0; i < vec_sz; ++i) {
+                if (in2[i] == argT2(0)) {
+                    res[i] = resT(0);
+                }
+                else {
+                    res[i] = in1[i] / in2[i];
+                    if constexpr (std::is_signed_v<resT>) {
+                        auto mod = in1[i] % in2[i];
+                        auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0));
+                        res[i] -= corr;
+                    }
+                }
+            }
+            return res;
+        }
+        else {
+            auto tmp = in1 / in2;
+            using tmpT = typename decltype(tmp)::element_type;
+#pragma unroll
+            for (int i = 0; i < vec_sz; ++i) {
+                if (in2[i] != argT2(0)) {
+                    tmp[i] = sycl::floor(tmp[i]);
+                }
+            }
+            if constexpr (std::is_same_v<resT, tmpT>) {
+                return tmp;
+            }
+            else {
+                using dpctl::tensor::type_utils::vec_cast;
+                return vec_cast<resT, tmpT, vec_sz>(tmp);
+            }
+        }
+    }
+
+private:
+    bool l_xor(bool b1, bool b2) const
+    {
+        return (b1 != b2);
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using FloorDivideContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    FloorDivideFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using FloorDivideStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    FloorDivideFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct FloorDivideOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct FloorDivideContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class floor_divide_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    floor_divide_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg1_p,
+                             ssize_t arg1_offset,
+                             const char *arg2_p,
+                             ssize_t arg2_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using FloorDivideHS =
+        hyperparam_detail::FloorDivideContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, FloorDivideOutputType, FloorDivideContigFunctor,
+        floor_divide_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct FloorDivideContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!FloorDivideOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = floor_divide_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct FloorDivideTypeMapFactory
+{
+    /*! @brief get typeid for output type of floor_divide(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename FloorDivideOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class floor_divide_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event floor_divide_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg1_p,
+    ssize_t arg1_offset,
+    const char *arg2_p,
+    ssize_t arg2_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, FloorDivideOutputType, FloorDivideStridedFunctor,
+        floor_divide_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct FloorDivideStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!FloorDivideOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = floor_divide_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct FloorDivideInplaceFunctor
+{
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    void operator()(resT &in1, const argT &in2) const
+    {
+        if constexpr (std::is_integral_v<resT>) {
+            if (in2 == argT(0)) {
+                in1 = 0;
+                return;
+            }
+            if constexpr (std::is_signed_v<resT>) {
+                auto tmp = in1;
+                in1 /= in2;
+                auto mod = tmp % in2;
+                auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0));
+                in1 -= corr;
+            }
+            else {
+                in1 /= in2;
+            }
+        }
+        else {
+            in1 /= in2;
+            if (in1 == resT(0)) {
+                return;
+            }
+            in1 = sycl::floor(in1);
+        }
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &in1,
+                    const sycl::vec<argT, vec_sz> &in2) const
+    {
+        if constexpr (std::is_integral_v<resT>) {
+#pragma unroll
+            for (int i = 0; i < vec_sz; ++i) {
+                if (in2[i] == argT(0)) {
+                    in1[i] = 0;
+                }
+                else {
+                    if constexpr (std::is_signed_v<resT>) {
+                        auto tmp = in1[i];
+                        in1[i] /= in2[i];
+                        auto mod = tmp % in2[i];
+                        auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0));
+                        in1[i] -= corr;
+                    }
+                    else {
+                        in1[i] /= in2[i];
+                    }
+                }
+            }
+        }
+        else {
+            in1 /= in2;
+#pragma unroll
+            for (int i = 0; i < vec_sz; ++i) {
+                if (in2[i] != argT(0)) {
+                    in1[i] = sycl::floor(in1[i]);
+                }
+            }
+        }
+    }
+
+private:
+    bool l_xor(bool b1, bool b2) const
+    {
+        return (b1 != b2);
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using FloorDivideInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        FloorDivideInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using FloorDivideInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        FloorDivideInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class floor_divide_inplace_contig_kernel;
+
+/* @brief Types supported by in-place floor division */
+template <typename argTy, typename resTy>
+struct FloorDivideInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct FloorDivideInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x //= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (FloorDivideInplaceTypePairSupport<argT, resT>::is_defined)
+        {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event floor_divide_inplace_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    using FloorDivideHS =
+        hyperparam_detail::FloorDivideContigHyperparameterSet<resTy, argTy>;
+
+    static constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, FloorDivideInplaceContigFunctor,
+        floor_divide_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct FloorDivideInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!FloorDivideInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = floor_divide_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class floor_divide_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event floor_divide_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, FloorDivideInplaceStridedFunctor,
+        floor_divide_inplace_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct FloorDivideInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!FloorDivideInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = floor_divide_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::floor_divide
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
new file mode 100644
index 000000000000..3e38b5f4deca
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
@@ -0,0 +1,319 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of comparison of
+/// tensor elements.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/math_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::greater
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct GreaterFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (tu_ns::is_complex<argT1>::value ||
+                      tu_ns::is_complex<argT2>::value)
+        {
+            static_assert(std::is_same_v<argT1, argT2>);
+            using dpctl::tensor::math_utils::greater_complex;
+            return greater_complex<argT1>(in1, in2);
+        }
+        else {
+            if constexpr (std::is_integral_v<argT1> &&
+                          std::is_integral_v<argT2> &&
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
+            {
+                if constexpr (std::is_signed_v<argT1> &&
+                              !std::is_signed_v<argT2>) {
+                    return (in1 < 0) ? false : (static_cast<argT2>(in1) > in2);
+                }
+                else {
+                    if constexpr (!std::is_signed_v<argT1> &&
+                                  std::is_signed_v<argT2>) {
+                        return (in2 < 0) ? true
+                                         : (in1 > static_cast<argT1>(in2));
+                    }
+                }
+            }
+            else {
+                return (in1 > in2);
+            }
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+
+        auto tmp = (in1 > in2);
+
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using GreaterContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            GreaterFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using GreaterStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    GreaterFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct GreaterOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct GreaterContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class greater_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event greater_contig_impl(sycl::queue &exec_q,
+                                std::size_t nelems,
+                                const char *arg1_p,
+                                ssize_t arg1_offset,
+                                const char *arg2_p,
+                                ssize_t arg2_offset,
+                                char *res_p,
+                                ssize_t res_offset,
+                                const std::vector<sycl::event> &depends = {})
+{
+    using GreaterHS =
+        hyperparam_detail::GreaterContigHyperparameterSet<argTy1, argTy2>;
+
+    static constexpr std::uint8_t vec_sz = GreaterHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = GreaterHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, GreaterOutputType, GreaterContigFunctor,
+        greater_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                               arg1_offset, arg2_p, arg2_offset,
+                                               res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct GreaterContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!GreaterOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = greater_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct GreaterTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename GreaterOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class greater_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    greater_strided_impl(sycl::queue &exec_q,
+                         std::size_t nelems,
+                         int nd,
+                         const ssize_t *shape_and_strides,
+                         const char *arg1_p,
+                         ssize_t arg1_offset,
+                         const char *arg2_p,
+                         ssize_t arg2_offset,
+                         char *res_p,
+                         ssize_t res_offset,
+                         const std::vector<sycl::event> &depends,
+                         const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, GreaterOutputType, GreaterStridedFunctor,
+        greater_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                arg1_offset, arg2_p, arg2_offset, res_p,
+                                res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct GreaterStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!GreaterOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = greater_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::greater
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
new file mode 100644
index 000000000000..029741b02600
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
@@ -0,0 +1,319 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of comparison of
+/// tensor elements.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/math_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::greater_equal
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct GreaterEqualFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (tu_ns::is_complex<argT1>::value ||
+                      tu_ns::is_complex<argT2>::value)
+        {
+            static_assert(std::is_same_v<argT1, argT2>);
+            using dpctl::tensor::math_utils::greater_equal_complex;
+            return greater_equal_complex<argT1>(in1, in2);
+        }
+        else {
+            if constexpr (std::is_integral_v<argT1> &&
+                          std::is_integral_v<argT2> &&
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
+            {
+                if constexpr (std::is_signed_v<argT1> &&
+                              !std::is_signed_v<argT2>) {
+                    return (in1 < 0) ? false : (static_cast<argT2>(in1) >= in2);
+                }
+                else {
+                    if constexpr (!std::is_signed_v<argT1> &&
+                                  std::is_signed_v<argT2>) {
+                        return (in2 < 0) ? true
+                                         : (in1 >= static_cast<argT1>(in2));
+                    }
+                }
+            }
+            else {
+                return (in1 >= in2);
+            }
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+
+        auto tmp = (in1 >= in2);
+
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using GreaterEqualContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    GreaterEqualFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using GreaterEqualStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    GreaterEqualFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct GreaterEqualOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct GreaterEqualContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class greater_equal_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    greater_equal_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg1_p,
+                              ssize_t arg1_offset,
+                              const char *arg2_p,
+                              ssize_t arg2_offset,
+                              char *res_p,
+                              ssize_t res_offset,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using GreaterEqHS =
+        hyperparam_detail::GreaterEqualContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = GreaterEqHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = GreaterEqHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, GreaterEqualOutputType, GreaterEqualContigFunctor,
+        greater_equal_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct GreaterEqualContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!GreaterEqualOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = greater_equal_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct GreaterEqualTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename GreaterEqualOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class greater_equal_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event greater_equal_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg1_p,
+    ssize_t arg1_offset,
+    const char *arg2_p,
+    ssize_t arg2_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, GreaterEqualOutputType, GreaterEqualStridedFunctor,
+        greater_equal_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct GreaterEqualStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!GreaterEqualOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = greater_equal_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::greater_equal
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
new file mode 100644
index 000000000000..438a5eea3ae8
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
@@ -0,0 +1,249 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of HYPOT(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::hypot
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct HypotFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        return sycl::hypot(in1, in2);
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto res = sycl::hypot(in1, in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(res)::element_type>) {
+            return res;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(res)::element_type, vec_sz>(
+                res);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using HypotContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            HypotFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using HypotStridedFunctor =
+    elementwise_common::BinaryStridedFunctor<argT1,
+                                             argT2,
+                                             resT,
+                                             IndexerT,
+                                             HypotFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct HypotOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct HypotContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class hypot_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event hypot_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              const char *arg1_p,
+                              ssize_t arg1_offset,
+                              const char *arg2_p,
+                              ssize_t arg2_offset,
+                              char *res_p,
+                              ssize_t res_offset,
+                              const std::vector<sycl::event> &depends = {})
+{
+    using HypotHS =
+        hyperparam_detail::HypotContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = HypotHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = HypotHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, HypotOutputType, HypotContigFunctor,
+        hypot_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                             arg1_offset, arg2_p, arg2_offset,
+                                             res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct HypotContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!HypotOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = hypot_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct HypotTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::hypot(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename HypotOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class hypot_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    hypot_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg1_p,
+                       ssize_t arg1_offset,
+                       const char *arg2_p,
+                       ssize_t arg2_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, HypotOutputType, HypotStridedFunctor,
+        hypot_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                              arg1_offset, arg2_p, arg2_offset, res_p,
+                              res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct HypotStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!HypotOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = hypot_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::hypot
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
new file mode 100644
index 000000000000..7f1c68c5c65c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
@@ -0,0 +1,316 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of comparison of
+/// tensor elements.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/math_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::less
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct LessFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (tu_ns::is_complex<argT1>::value ||
+                      tu_ns::is_complex<argT2>::value)
+        {
+            static_assert(std::is_same_v<argT1, argT2>);
+            using dpctl::tensor::math_utils::less_complex;
+            return less_complex<argT1>(in1, in2);
+        }
+        else {
+            if constexpr (std::is_integral_v<argT1> &&
+                          std::is_integral_v<argT2> &&
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
+            {
+                if constexpr (std::is_signed_v<argT1> &&
+                              !std::is_signed_v<argT2>) {
+                    return (in1 < 0) ? true : (static_cast<argT2>(in1) < in2);
+                }
+                else {
+                    if constexpr (!std::is_signed_v<argT1> &&
+                                  std::is_signed_v<argT2>) {
+                        return (in2 < 0) ? false
+                                         : (in1 < static_cast<argT1>(in2));
+                    }
+                }
+            }
+            else {
+                return (in1 < in2);
+            }
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = (in1 < in2);
+
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using LessContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            LessFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using LessStridedFunctor =
+    elementwise_common::BinaryStridedFunctor<argT1,
+                                             argT2,
+                                             resT,
+                                             IndexerT,
+                                             LessFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct LessOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct LessContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class less_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event less_contig_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             const char *arg1_p,
+                             ssize_t arg1_offset,
+                             const char *arg2_p,
+                             ssize_t arg2_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends = {})
+{
+    using LessHS =
+        hyperparam_detail::LessContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = LessHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LessHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, LessOutputType, LessContigFunctor, less_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LessContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!LessOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = less_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct LessTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename LessOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class less_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    less_strided_impl(sycl::queue &exec_q,
+                      std::size_t nelems,
+                      int nd,
+                      const ssize_t *shape_and_strides,
+                      const char *arg1_p,
+                      ssize_t arg1_offset,
+                      const char *arg2_p,
+                      ssize_t arg2_offset,
+                      char *res_p,
+                      ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, LessOutputType, LessStridedFunctor,
+        less_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                             arg1_offset, arg2_p, arg2_offset, res_p,
+                             res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LessStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!LessOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = less_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::less
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
new file mode 100644
index 000000000000..a8c58ee31277
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
@@ -0,0 +1,318 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of comparison of
+/// tensor elements.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/math_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::less_equal
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct LessEqualFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (tu_ns::is_complex<argT1>::value ||
+                      tu_ns::is_complex<argT2>::value)
+        {
+            static_assert(std::is_same_v<argT1, argT2>);
+            using dpctl::tensor::math_utils::less_equal_complex;
+            return less_equal_complex<argT1>(in1, in2);
+        }
+        else {
+            if constexpr (std::is_integral_v<argT1> &&
+                          std::is_integral_v<argT2> &&
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
+            {
+                if constexpr (std::is_signed_v<argT1> &&
+                              !std::is_signed_v<argT2>) {
+                    return (in1 < 0) ? true : (static_cast<argT2>(in1) <= in2);
+                }
+                else {
+                    if constexpr (!std::is_signed_v<argT1> &&
+                                  std::is_signed_v<argT2>) {
+                        return (in2 < 0) ? false
+                                         : (in1 <= static_cast<argT1>(in2));
+                    }
+                }
+            }
+            else {
+                return (in1 <= in2);
+            }
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+
+        auto tmp = (in1 <= in2);
+
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using LessEqualContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    LessEqualFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using LessEqualStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    LessEqualFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct LessEqualOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct LessEqualContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class less_equal_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event less_equal_contig_impl(sycl::queue &exec_q,
+                                   std::size_t nelems,
+                                   const char *arg1_p,
+                                   ssize_t arg1_offset,
+                                   const char *arg2_p,
+                                   ssize_t arg2_offset,
+                                   char *res_p,
+                                   ssize_t res_offset,
+                                   const std::vector<sycl::event> &depends = {})
+{
+    using LessEqHS =
+        hyperparam_detail::LessEqualContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = LessEqHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LessEqHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, LessEqualOutputType, LessEqualContigFunctor,
+        less_equal_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LessEqualContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!LessEqualOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = less_equal_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct LessEqualTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename LessEqualOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class less_equal_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    less_equal_strided_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            int nd,
+                            const ssize_t *shape_and_strides,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends,
+                            const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, LessEqualOutputType, LessEqualStridedFunctor,
+        less_equal_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LessEqualStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!LessEqualOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = less_equal_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::less_equal
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index af93b089f0b2..3a79950672d2 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -48,7 +48,6 @@
 
 #include "utils/math_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
 
@@ -56,10 +55,6 @@ namespace dpctl::tensor::kernels::logaddexp
 {
 using dpctl::tensor::ssize_t;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::vec_cast;
 
 template <typename argT1, typename argT2, typename resT>
 struct LogAddExpFunctor
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
new file mode 100644
index 000000000000..f8219764071f
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
@@ -0,0 +1,668 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of DIVIDE(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::true_divide
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct TrueDivideFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (tu_ns::is_complex<argT1>::value &&
+                      tu_ns::is_complex<argT2>::value)
+        {
+            using realT1 = typename argT1::value_type;
+            using realT2 = typename argT2::value_type;
+
+            return exprm_ns::complex<realT1>(in1) /
+                   exprm_ns::complex<realT2>(in2);
+        }
+        else if constexpr (tu_ns::is_complex<argT1>::value &&
+                           !tu_ns::is_complex<argT2>::value)
+        {
+            using realT1 = typename argT1::value_type;
+
+            return exprm_ns::complex<realT1>(in1) / in2;
+        }
+        else if constexpr (!tu_ns::is_complex<argT1>::value &&
+                           tu_ns::is_complex<argT2>::value)
+        {
+            using realT2 = typename argT2::value_type;
+
+            return in1 / exprm_ns::complex<realT2>(in2);
+        }
+        else {
+            return in1 / in2;
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = in1 / in2;
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using TrueDivideContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    TrueDivideFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using TrueDivideStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    TrueDivideFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct TrueDivideOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        float,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        float,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        std::complex<double>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        double,
+                                        T2,
+                                        std::complex<double>,
+                                        std::complex<double>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        double,
+                                        std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct TrueDivideContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class true_divide_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    true_divide_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using DivHS =
+        hyperparam_detail::TrueDivideContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = DivHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = DivHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, TrueDivideOutputType, TrueDivideContigFunctor,
+        true_divide_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct TrueDivideContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!TrueDivideOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = true_divide_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct TrueDivideTypeMapFactory
+{
+    /*! @brief get typeid for output type of divide(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename TrueDivideOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class true_divide_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    true_divide_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg1_p,
+                             ssize_t arg1_offset,
+                             const char *arg2_p,
+                             ssize_t arg2_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, TrueDivideOutputType, TrueDivideStridedFunctor,
+        true_divide_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct TrueDivideStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!TrueDivideOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = true_divide_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+using TrueDivideContigMatrixContigRowBroadcastingFunctor =
+    elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        TrueDivideFunctor<argT1, argT2, resT>>;
+
+template <typename argT1, typename argT2, typename resT>
+using TrueDivideContigRowContigMatrixBroadcastingFunctor =
+    elementwise_common::BinaryContigRowContigMatrixBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        TrueDivideFunctor<argT1, argT2, resT>>;
+
+template <typename argT1, typename argT2, typename resT>
+class true_divide_matrix_row_broadcast_sg_krn;
+
+template <typename argT1, typename argT2, typename resT>
+class true_divide_row_matrix_broadcast_sg_krn;
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event true_divide_contig_matrix_contig_row_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = mat[i,j] / vec[j]
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
+        argT1, argT2, resT, TrueDivideContigMatrixContigRowBroadcastingFunctor,
+        true_divide_matrix_row_broadcast_sg_krn>(
+        exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct TrueDivideContigMatrixContigRowBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!TrueDivideOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using resT = typename TrueDivideOutputType<T1, T2>::value_type;
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value ||
+                          dpctl::tensor::type_utils::is_complex<resT>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn =
+                    true_divide_contig_matrix_contig_row_broadcast_impl<T1, T2,
+                                                                        resT>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event true_divide_contig_row_contig_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = mat[i,j] + vec[j]
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl<
+        argT1, argT2, resT, TrueDivideContigRowContigMatrixBroadcastingFunctor,
+        true_divide_row_matrix_broadcast_sg_krn>(
+        exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, res_p,
+        res_offset, depends);
+};
+
+template <typename fnT, typename T1, typename T2>
+struct TrueDivideContigRowContigMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!TrueDivideOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using resT = typename TrueDivideOutputType<T1, T2>::value_type;
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value ||
+                          dpctl::tensor::type_utils::is_complex<resT>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn =
+                    true_divide_contig_row_contig_matrix_broadcast_impl<T1, T2,
+                                                                        resT>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct TrueDivideInplaceFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+
+    void operator()(resT &res, const argT &in)
+    {
+        if constexpr (tu_ns::is_complex<resT>::value) {
+            if constexpr (tu_ns::is_complex<argT>::value) {
+                using res_rT = typename resT::value_type;
+                using arg_rT = typename argT::value_type;
+
+                auto res1 = exprm_ns::complex<res_rT>(res);
+                res1 /= exprm_ns::complex<arg_rT>(in);
+                res = res1;
+            }
+            else {
+                using res_rT = typename resT::value_type;
+
+                auto res1 = exprm_ns::complex<res_rT>(res);
+                res1 /= in;
+                res = res1;
+            }
+        }
+        else {
+            res /= in;
+        }
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in)
+    {
+        res /= in;
+    }
+};
+
+/* @brief Types supported by in-place divide */
+template <typename argTy, typename resTy>
+struct TrueDivideInplaceTypePairSupport
+{
+
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
+        td_ns::TypePairDefinedEntry<argTy, float, resTy, std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    resTy,
+                                    std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy, double, resTy, std::complex<double>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    resTy,
+                                    std::complex<double>>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct TrueDivideInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of divide(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (TrueDivideInplaceTypePairSupport<argT, resT>::is_defined)
+        {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using TrueDivideInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        TrueDivideInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using TrueDivideInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        TrueDivideInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class true_divide_inplace_contig_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event true_divide_inplace_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    using DivHS =
+        hyperparam_detail::TrueDivideContigHyperparameterSet<resTy, argTy>;
+    static constexpr std::uint8_t vec_sz = DivHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = DivHS::vec_sz;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, TrueDivideInplaceContigFunctor,
+        true_divide_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct TrueDivideInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!TrueDivideInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = true_divide_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class true_divide_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event true_divide_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, TrueDivideInplaceStridedFunctor,
+        true_divide_inplace_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct TrueDivideInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!TrueDivideInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = true_divide_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+class true_divide_inplace_row_matrix_broadcast_sg_krn;
+
+template <typename argT, typename resT>
+using TrueDivideInplaceRowMatrixBroadcastingFunctor =
+    elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor<
+        argT,
+        resT,
+        TrueDivideInplaceFunctor<argT, resT>>;
+
+template <typename argT, typename resT>
+sycl::event true_divide_inplace_row_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_inplace_row_matrix_broadcast_impl<
+        argT, resT, TrueDivideInplaceRowMatrixBroadcastingFunctor,
+        true_divide_inplace_row_matrix_broadcast_sg_krn>(
+        exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset,
+        depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct TrueDivideInplaceRowMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!TrueDivideInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn = true_divide_inplace_row_matrix_broadcast_impl<T1, T2>;
+                return fn;
+            }
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::true_divide
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
index 873c4dc89b44..8f84d950c0cd 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
@@ -35,7 +35,7 @@
 #pragma once
 
 #include <algorithm>
-#include <array
+#include <array>
 #include <cassert>
 #include <complex>
 #include <cstddef>
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
index e4e730a1da6b..8170f047c488 100644
--- a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
@@ -57,26 +57,26 @@
 // #include "copysign.hpp"
 #include "cos.hpp"
 #include "cosh.hpp"
-// #include "equal.hpp"
+#include "equal.hpp"
 #include "exp.hpp"
 #include "exp2.hpp"
 #include "expm1.hpp"
 #include "floor.hpp"
-// #include "floor_divide.hpp"
-// #include "greater.hpp"
-// #include "greater_equal.hpp"
-// #include "hypot.hpp"
+#include "floor_divide.hpp"
+#include "greater.hpp"
+#include "greater_equal.hpp"
+#include "hypot.hpp"
 #include "imag.hpp"
 #include "isfinite.hpp"
 #include "isinf.hpp"
 #include "isnan.hpp"
-// #include "less.hpp"
-// #include "less_equal.hpp"
+#include "less.hpp"
+#include "less_equal.hpp"
 #include "log.hpp"
 #include "log10.hpp"
 #include "log1p.hpp"
 #include "log2.hpp"
-// #include "logaddexp.hpp"
+#include "logaddexp.hpp"
 // #include "logical_and.hpp"
 #include "logical_not.hpp"
 // #include "logical_or.hpp"
@@ -104,7 +104,7 @@
 // #include "subtract.hpp"
 #include "tan.hpp"
 #include "tanh.hpp"
-// #include "true_divide.hpp"
+#include "true_divide.hpp"
 #include "trunc.hpp"
 
 namespace dpctl::tensor::py_internal
@@ -137,32 +137,31 @@ void init_elementwise_functions(py::module_ m)
     // init_copysign(m);
     init_cos(m);
     init_cosh(m);
-    // init_divide(m);
-    // init_equal(m);
+    init_divide(m);
+    init_equal(m);
     init_exp(m);
     init_exp2(m);
     init_expm1(m);
     init_floor(m);
-    // init_floor_divide(m);
-    // init_greater(m);
-    // init_greater_equal(m);
-    // init_hypot(m);
+    init_floor_divide(m);
+    init_greater(m);
+    init_greater_equal(m);
+    init_hypot(m);
     init_imag(m);
     init_isfinite(m);
     init_isinf(m);
     init_isnan(m);
-    // init_less(m);
-    // init_less_equal(m);
+    init_less(m);
+    init_less_equal(m);
     init_log(m);
     init_log10(m);
     init_log1p(m);
     init_log2(m);
-    // init_logaddexp(m);
+    init_logaddexp(m);
     // init_logical_and(m);
     init_logical_not(m);
     // init_logical_or(m);
     // init_logical_xor(m);
-    // init_maximum(m);
     // init_minimum(m);
     // init_multiply(m);
     // init_nextafter(m);
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/equal.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/equal.cpp
new file mode 100644
index 000000000000..863501bea367
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/equal.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/equal.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B09: ===== EQUAL (x1, x2)
+namespace impl
+{
+namespace equal_fn_ns = dpctl::tensor::kernels::equal;
+
+static binary_contig_impl_fn_ptr_t
+    equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::EqualTypeMapFactory;
+    DispatchTableBuilder<int, EqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::EqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, EqualStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::EqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, EqualContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_equal_dispatch_tables();
+        using impl::equal_contig_dispatch_table;
+        using impl::equal_output_id_table;
+        using impl::equal_strided_dispatch_table;
+
+        auto equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                               const arrayT &dst, sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                           const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               equal_output_id_table);
+        };
+        m.def("_equal", equal_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_equal_result_type", equal_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/equal.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/equal.hpp
new file mode 100644
index 000000000000..23f370111458
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/equal.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_equal(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor_divide.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
new file mode 100644
index 000000000000..af4635a0f500
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
@@ -0,0 +1,205 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "floor_divide.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/floor_divide.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B10: ===== FLOOR_DIVIDE (x1, x2)
+namespace impl
+{
+namespace floor_divide_fn_ns = dpctl::tensor::kernels::floor_divide;
+
+static binary_contig_impl_fn_ptr_t
+    floor_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types];
+static int floor_divide_inplace_output_id_table[td_ns::num_types]
+                                               [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    floor_divide_inplace_contig_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    floor_divide_inplace_strided_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+
+void populate_floor_divide_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = floor_divide_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::FloorDivideTypeMapFactory;
+    DispatchTableBuilder<int, FloorDivideTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(floor_divide_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::FloorDivideStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         FloorDivideStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(floor_divide_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::FloorDivideContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, FloorDivideContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::FloorDivideInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         FloorDivideInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(floor_divide_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::FloorDivideInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         FloorDivideInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(floor_divide_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::FloorDivideInplaceTypeMapFactory;
+    DispatchTableBuilder<int, FloorDivideInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(floor_divide_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_floor_divide(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_floor_divide_dispatch_tables();
+        using impl::floor_divide_contig_dispatch_table;
+        using impl::floor_divide_output_id_table;
+        using impl::floor_divide_strided_dispatch_table;
+
+        auto floor_divide_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                      const arrayT &dst, sycl::queue &exec_q,
+                                      const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, floor_divide_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                floor_divide_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                floor_divide_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto floor_divide_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                  const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               floor_divide_output_id_table);
+        };
+        m.def("_floor_divide", floor_divide_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, "");
+
+        using impl::floor_divide_inplace_contig_dispatch_table;
+        using impl::floor_divide_inplace_output_id_table;
+        using impl::floor_divide_inplace_strided_dispatch_table;
+
+        auto floor_divide_inplace_pyapi = [&](const arrayT &src,
+                                              const arrayT &dst,
+                                              sycl::queue &exec_q,
+                                              const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, floor_divide_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                floor_divide_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                floor_divide_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_floor_divide_inplace", floor_divide_inplace_pyapi, "",
+              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor_divide.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
new file mode 100644
index 000000000000..17d493b58057
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_floor_divide(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater.cpp
new file mode 100644
index 000000000000..f3cfaeae2286
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "greater.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/greater.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B11: ===== GREATER (x1, x2)
+namespace impl
+{
+namespace greater_fn_ns = dpctl::tensor::kernels::greater;
+
+static binary_contig_impl_fn_ptr_t
+    greater_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int greater_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    greater_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_greater_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = greater_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::GreaterTypeMapFactory;
+    DispatchTableBuilder<int, GreaterTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(greater_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::GreaterStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, GreaterStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(greater_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::GreaterContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, GreaterContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(greater_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_greater(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_greater_dispatch_tables();
+        using impl::greater_contig_dispatch_table;
+        using impl::greater_output_id_table;
+        using impl::greater_strided_dispatch_table;
+
+        auto greater_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, greater_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                greater_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                greater_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto greater_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               greater_output_id_table);
+        };
+        m.def("_greater", greater_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_greater_result_type", greater_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater.hpp
new file mode 100644
index 000000000000..c8c3caa5f1fd
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_greater(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater_equal.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
new file mode 100644
index 000000000000..ad9af91ce3d8
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "greater_equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/greater_equal.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B12: ===== GREATER_EQUAL (x1, x2)
+namespace impl
+{
+namespace greater_equal_fn_ns = dpctl::tensor::kernels::greater_equal;
+
+static binary_contig_impl_fn_ptr_t
+    greater_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int greater_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    greater_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_greater_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = greater_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::GreaterEqualTypeMapFactory;
+    DispatchTableBuilder<int, GreaterEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(greater_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::GreaterEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         GreaterEqualStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(greater_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::GreaterEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, GreaterEqualContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(greater_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_greater_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_greater_equal_dispatch_tables();
+        using impl::greater_equal_contig_dispatch_table;
+        using impl::greater_equal_output_id_table;
+        using impl::greater_equal_strided_dispatch_table;
+
+        auto greater_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                       const arrayT &dst, sycl::queue &exec_q,
+                                       const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, greater_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                greater_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                greater_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto greater_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                   const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               greater_equal_output_id_table);
+        };
+        m.def("_greater_equal", greater_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_greater_equal_result_type", greater_equal_result_type_pyapi,
+              "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater_equal.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
new file mode 100644
index 000000000000..0cf7f8e89bbf
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_greater_equal(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/hypot.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/hypot.cpp
new file mode 100644
index 000000000000..f4ce161f4cda
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/hypot.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "hypot.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/hypot.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B24: ===== HYPOT (x1, x2)
+namespace impl
+{
+namespace hypot_fn_ns = dpctl::tensor::kernels::hypot;
+
+static binary_contig_impl_fn_ptr_t
+    hypot_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int hypot_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    hypot_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_hypot_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = hypot_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::HypotTypeMapFactory;
+    DispatchTableBuilder<int, HypotTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(hypot_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::HypotStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, HypotStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(hypot_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::HypotContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, HypotContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(hypot_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_hypot(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_hypot_dispatch_tables();
+        using impl::hypot_contig_dispatch_table;
+        using impl::hypot_output_id_table;
+        using impl::hypot_strided_dispatch_table;
+
+        auto hypot_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                               const arrayT &dst, sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, hypot_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                hypot_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                hypot_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto hypot_result_type_pyapi = [&](const py::dtype &dtype1,
+                                           const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               hypot_output_id_table);
+        };
+        m.def("_hypot", hypot_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_hypot_result_type", hypot_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/hypot.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/hypot.hpp
new file mode 100644
index 000000000000..5bc73e717ad3
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/hypot.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_hypot(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/less.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/less.cpp
new file mode 100644
index 000000000000..d587ee713178
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/less.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "less.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/less.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B13: ===== LESS (x1, x2)
+namespace impl
+{
+namespace less_fn_ns = dpctl::tensor::kernels::less;
+
+static binary_contig_impl_fn_ptr_t less_contig_dispatch_table[td_ns::num_types]
+                                                             [td_ns::num_types];
+static int less_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    less_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_less_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = less_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LessTypeMapFactory;
+    DispatchTableBuilder<int, LessTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(less_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LessStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LessStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(less_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LessContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LessContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(less_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_less(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_less_dispatch_tables();
+        using impl::less_contig_dispatch_table;
+        using impl::less_output_id_table;
+        using impl::less_strided_dispatch_table;
+
+        auto less_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                              const arrayT &dst, sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, less_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                less_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                less_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto less_result_type_pyapi = [&](const py::dtype &dtype1,
+                                          const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               less_output_id_table);
+        };
+        m.def("_less", less_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_less_result_type", less_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/less.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/less.hpp
new file mode 100644
index 000000000000..e08d84f380da
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/less.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_less(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/less_equal.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/less_equal.cpp
new file mode 100644
index 000000000000..433969cead27
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/less_equal.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "less_equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/less_equal.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B14: ===== LESS_EQUAL (x1, x2)
+namespace impl
+{
+namespace less_equal_fn_ns = dpctl::tensor::kernels::less_equal;
+
+static binary_contig_impl_fn_ptr_t
+    less_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int less_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    less_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_less_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = less_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LessEqualTypeMapFactory;
+    DispatchTableBuilder<int, LessEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(less_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LessEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LessEqualStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(less_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LessEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LessEqualContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(less_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_less_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_less_equal_dispatch_tables();
+        using impl::less_equal_contig_dispatch_table;
+        using impl::less_equal_output_id_table;
+        using impl::less_equal_strided_dispatch_table;
+
+        auto less_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                    const arrayT &dst, sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, less_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                less_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                less_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto less_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               less_equal_output_id_table);
+        };
+        m.def("_less_equal", less_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_less_equal_result_type", less_equal_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/less_equal.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/less_equal.hpp
new file mode 100644
index 000000000000..8eeb837a35a7
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/less_equal.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_less_equal(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logaddexp.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
new file mode 100644
index 000000000000..71bc9cad4035
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logaddexp.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logaddexp.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B15: ===== LOGADDEXP (x1, x2)
+namespace impl
+{
+namespace logaddexp_fn_ns = dpctl::tensor::kernels::logaddexp;
+
+static binary_contig_impl_fn_ptr_t
+    logaddexp_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logaddexp_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logaddexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logaddexp_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logaddexp_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogAddExpTypeMapFactory;
+    DispatchTableBuilder<int, LogAddExpTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logaddexp_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogAddExpStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogAddExpStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logaddexp_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogAddExpContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogAddExpContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logaddexp_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logaddexp(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logaddexp_dispatch_tables();
+        using impl::logaddexp_contig_dispatch_table;
+        using impl::logaddexp_output_id_table;
+        using impl::logaddexp_strided_dispatch_table;
+
+        auto logaddexp_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logaddexp_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logaddexp_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logaddexp_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logaddexp_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logaddexp_output_id_table);
+        };
+        m.def("_logaddexp", logaddexp_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logaddexp.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logaddexp.hpp
new file mode 100644
index 000000000000..2c4efa7d0c56
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logaddexp.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logaddexp(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/true_divide.cpp
new file mode 100644
index 000000000000..4c1a117fbcae
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/true_divide.cpp
@@ -0,0 +1,500 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <utility> // for std::ignore
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "simplify_iteration_space.hpp"
+#include "true_divide.hpp"
+
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/true_divide.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B08: ===== DIVIDE (x1, x2)
+namespace impl
+{
+namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide;
+
+static binary_contig_impl_fn_ptr_t
+    true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types];
+static int true_divide_inplace_output_id_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// divide(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    true_divide_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// divide(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    true_divide_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    true_divide_inplace_contig_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    true_divide_inplace_strided_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    true_divide_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+void populate_true_divide_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = true_divide_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::TrueDivideTypeMapFactory;
+    DispatchTableBuilder<int, TrueDivideTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(true_divide_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::TrueDivideStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, TrueDivideStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(true_divide_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::TrueDivideContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, TrueDivideContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(true_divide_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::TrueDivideContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        TrueDivideContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        true_divide_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::TrueDivideContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        TrueDivideContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::TrueDivideInplaceTypeMapFactory;
+    DispatchTableBuilder<int, TrueDivideInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(true_divide_inplace_output_id_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::TrueDivideInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         TrueDivideInplaceStridedFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(true_divide_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::TrueDivideInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         TrueDivideInplaceContigFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(true_divide_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::TrueDivideInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         TrueDivideInplaceRowMatrixBroadcastFactory, num_types>
+        dtb9;
+    dtb9.populate_dispatch_table(true_divide_inplace_row_matrix_dispatch_table);
+};
+
+template <typename T>
+class divide_by_scalar_krn;
+
+typedef sycl::event (*divide_by_scalar_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const ssize_t *,
+    const char *,
+    py::ssize_t,
+    const char *,
+    char *,
+    py::ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T, typename scalarT>
+sycl::event divide_by_scalar(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             py::ssize_t arg_offset,
+                             const char *scalar_ptr,
+                             char *res_p,
+                             py::ssize_t res_offset,
+                             const std::vector<sycl::event> &depends = {})
+{
+    const scalarT sc_v = *reinterpret_cast<const scalarT *>(scalar_ptr);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using BinOpT =
+            dpctl::tensor::kernels::true_divide::TrueDivideFunctor<T, scalarT,
+                                                                   T>;
+
+        auto op = BinOpT();
+
+        using IndexerT =
+            typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+        const IndexerT two_offsets_indexer{nd, arg_offset, res_offset,
+                                           shape_and_strides};
+
+        const T *arg_tp = reinterpret_cast<const T *>(arg_p);
+        T *res_tp = reinterpret_cast<T *>(res_p);
+
+        cgh.parallel_for<divide_by_scalar_krn<T>>(
+            {nelems}, [=](sycl::id<1> id) {
+                const auto &two_offsets_ =
+                    two_offsets_indexer(static_cast<ssize_t>(id.get(0)));
+
+                const auto &arg_i = two_offsets_.get_first_offset();
+                const auto &res_i = two_offsets_.get_second_offset();
+                res_tp[res_i] = op(arg_tp[arg_i], sc_v);
+            });
+    });
+    return comp_ev;
+}
+
+std::pair<sycl::event, sycl::event>
+    py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,
+                        double scalar,
+                        const dpctl::tensor::usm_ndarray &dst,
+                        sycl::queue &exec_q,
+                        const std::vector<sycl::event> &depends = {})
+{
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array has unexpected elemental data type.");
+    }
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+    // check shapes, broadcasting is assumed done by caller
+    // check that dimensions are the same
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != src.get_ndim()) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    // check that shapes are the same
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+
+    for (int i = 0; i < dst_nd; ++i) {
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    // if nelems is zero, return
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(src, dst) && !same_logical_tensors(src, dst))) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    static constexpr int float16_typeid =
+        static_cast<int>(td_ns::typenum_t::HALF);
+    static constexpr int float32_typeid =
+        static_cast<int>(td_ns::typenum_t::FLOAT);
+    static constexpr int float64_typeid =
+        static_cast<int>(td_ns::typenum_t::DOUBLE);
+    static constexpr int complex64_typeid =
+        static_cast<int>(td_ns::typenum_t::CFLOAT);
+    static constexpr int complex128_typeid =
+        static_cast<int>(td_ns::typenum_t::CDOUBLE);
+
+    // statically pre-allocated memory for scalar
+    alignas(double) char scalar_alloc[sizeof(double)] = {0};
+
+    divide_by_scalar_fn_ptr_t fn;
+    // placement new into stack memory means no call to delete is necessary
+    switch (src_typeid) {
+    case float16_typeid:
+    {
+        fn = divide_by_scalar<sycl::half, sycl::half>;
+        std::ignore =
+            new (scalar_alloc) sycl::half(static_cast<sycl::half>(scalar));
+        break;
+    }
+    case float32_typeid:
+    {
+        fn = divide_by_scalar<float, float>;
+        std::ignore = new (scalar_alloc) float(scalar);
+        break;
+    }
+    case float64_typeid:
+    {
+        fn = divide_by_scalar<double, double>;
+        std::ignore = new (scalar_alloc) double(scalar);
+        break;
+    }
+    case complex64_typeid:
+    {
+        fn = divide_by_scalar<std::complex<float>, float>;
+        std::ignore = new (scalar_alloc) float(scalar);
+        break;
+    }
+    case complex128_typeid:
+    {
+        fn = divide_by_scalar<std::complex<double>, double>;
+        std::ignore = new (scalar_alloc) double(scalar);
+        break;
+    }
+    default:
+        throw std::runtime_error("Implementation is missing for typeid=" +
+                                 std::to_string(src_typeid));
+    }
+
+    // simplify strides
+    auto const &src_strides = src.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = dst_nd;
+    const py::ssize_t *shape = src_shape;
+
+    std::vector<sycl::event> host_tasks{};
+    simplify_iteration_space(nd, shape, src_strides, dst_strides,
+                             // outputs
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
+
+    if (nd == 0) {
+        // handle 0d array as 1d array with 1 element
+        static constexpr py::ssize_t one{1};
+        simplified_shape.push_back(one);
+        simplified_src_strides.push_back(one);
+        simplified_dst_strides.push_back(one);
+        src_offset = 0;
+        dst_offset = 0;
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_tasks, simplified_shape, simplified_src_strides,
+        simplified_dst_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_));
+    auto &copy_metadata_ev = std::get<2>(ptr_sz_event_triple_);
+
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    sycl::event div_ev =
+        fn(exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
+           scalar_alloc, dst_data, dst_offset, all_deps);
+
+    // async free of shape_strides temporary
+    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {div_ev}, shape_strides_owner);
+
+    host_tasks.push_back(tmp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_tasks), div_ev);
+}
+
+} // namespace impl
+
+void init_divide(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_true_divide_dispatch_tables();
+        using impl::true_divide_contig_dispatch_table;
+        using impl::
+            true_divide_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::
+            true_divide_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::true_divide_output_id_table;
+        using impl::true_divide_strided_dispatch_table;
+
+        auto divide_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                const arrayT &dst, sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, true_divide_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                true_divide_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                true_divide_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                true_divide_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto divide_result_type_pyapi = [&](const py::dtype &dtype1,
+                                            const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               true_divide_output_id_table);
+        };
+        m.def("_divide", divide_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_divide_result_type", divide_result_type_pyapi, "");
+
+        using impl::true_divide_inplace_contig_dispatch_table;
+        using impl::true_divide_inplace_output_id_table;
+        using impl::true_divide_inplace_row_matrix_dispatch_table;
+        using impl::true_divide_inplace_strided_dispatch_table;
+
+        auto divide_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                        sycl::queue &exec_q,
+                                        const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, true_divide_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                true_divide_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                true_divide_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                true_divide_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_divide_inplace", divide_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        using impl::py_divide_by_scalar;
+        m.def("_divide_by_scalar", &py_divide_by_scalar, "", py::arg("src"),
+              py::arg("scalar"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/true_divide.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/true_divide.hpp
new file mode 100644
index 000000000000..941384beaf8d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/true_divide.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_divide(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py
index 90a669f713bd..b901206f8763 100644
--- a/dpnp/dpnp_iface_logic.py
+++ b/dpnp/dpnp_iface_logic.py
@@ -661,8 +661,8 @@ def array_equiv(a1, a2):
 
 equal = DPNPBinaryFunc(
     "equal",
-    ti._equal_result_type,
-    ti._equal,
+    ti_ext._equal_result_type,
+    ti_ext._equal,
     _EQUAL_DOCSTRING,
 )
 
@@ -737,8 +737,8 @@ def array_equiv(a1, a2):
 
 greater = DPNPBinaryFunc(
     "greater",
-    ti._greater_result_type,
-    ti._greater,
+    ti_ext._greater_result_type,
+    ti_ext._greater,
     _GREATER_DOCSTRING,
 )
 
@@ -814,8 +814,8 @@ def array_equiv(a1, a2):
 
 greater_equal = DPNPBinaryFunc(
     "greater_equal",
-    ti._greater_equal_result_type,
-    ti._greater_equal,
+    ti_ext._greater_equal_result_type,
+    ti_ext._greater_equal,
     _GREATER_EQUAL_DOCSTRING,
 )
 
@@ -1750,8 +1750,8 @@ def isscalar(element):
 
 less = DPNPBinaryFunc(
     "less",
-    ti._less_result_type,
-    ti._less,
+    ti_ext._less_result_type,
+    ti_ext._less,
     _LESS_DOCSTRING,
 )
 
@@ -1826,8 +1826,8 @@ def isscalar(element):
 
 less_equal = DPNPBinaryFunc(
     "less_equal",
-    ti._less_equal_result_type,
-    ti._less_equal,
+    ti_ext._less_equal_result_type,
+    ti_ext._less_equal,
     _LESS_EQUAL_DOCSTRING,
 )
 
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index d1bdbdcfc961..54a17cec0c37 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -1558,12 +1558,12 @@ def diff(a, n=1, axis=-1, prepend=None, append=None):
 
 divide = DPNPBinaryFunc(
     "divide",
-    ti._divide_result_type,
-    ti._divide,
+    ti_ext._divide_result_type,
+    ti_ext._divide,
     _DIVIDE_DOCSTRING,
     mkl_fn_to_call="_mkl_div_to_call",
     mkl_impl_fn="_div",
-    binary_inplace_fn=ti._divide_inplace,
+    binary_inplace_fn=ti_ext._divide_inplace,
     acceptance_fn=dtu._acceptance_fn_divide,
 )
 
@@ -2139,10 +2139,10 @@ def ediff1d(ary, to_end=None, to_begin=None):
 
 floor_divide = DPNPBinaryFunc(
     "floor_divide",
-    ti._floor_divide_result_type,
-    ti._floor_divide,
+    ti_ext._floor_divide_result_type,
+    ti_ext._floor_divide,
     _FLOOR_DIVIDE_DOCSTRING,
-    binary_inplace_fn=ti._floor_divide_inplace,
+    binary_inplace_fn=ti_ext._floor_divide_inplace,
 )
 
 
diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py
index 186ae47b0958..906a20f1625e 100644
--- a/dpnp/dpnp_iface_trigonometric.py
+++ b/dpnp/dpnp_iface_trigonometric.py
@@ -42,12 +42,10 @@
 # pylint: disable=protected-access
 # pylint: disable=no-name-in-module
 
-import dpctl.tensor._tensor_elementwise_impl as ti
-
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_elementwise_impl as ti_ext
+import dpctl_ext.tensor._tensor_elementwise_impl as ti
 import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
@@ -139,8 +137,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 acos = DPNPUnaryFunc(
     "acos",
-    ti_ext._acos_result_type,
-    ti_ext._acos,
+    ti._acos_result_type,
+    ti._acos,
     _ACOS_DOCSTRING,
     mkl_fn_to_call="_mkl_acos_to_call",
     mkl_impl_fn="_acos",
@@ -225,8 +223,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 acosh = DPNPUnaryFunc(
     "acosh",
-    ti_ext._acosh_result_type,
-    ti_ext._acosh,
+    ti._acosh_result_type,
+    ti._acosh,
     _ACOSH_DOCSTRING,
     mkl_fn_to_call="_mkl_acosh_to_call",
     mkl_impl_fn="_acosh",
@@ -311,8 +309,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 asin = DPNPUnaryFunc(
     "asin",
-    ti_ext._asin_result_type,
-    ti_ext._asin,
+    ti._asin_result_type,
+    ti._asin,
     _ASIN_DOCSTRING,
     mkl_fn_to_call="_mkl_asin_to_call",
     mkl_impl_fn="_asin",
@@ -395,8 +393,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 asinh = DPNPUnaryFunc(
     "asinh",
-    ti_ext._asinh_result_type,
-    ti_ext._asinh,
+    ti._asinh_result_type,
+    ti._asinh,
     _ASINH_DOCSTRING,
     mkl_fn_to_call="_mkl_asinh_to_call",
     mkl_impl_fn="_asinh",
@@ -481,8 +479,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 atan = DPNPUnaryFunc(
     "atan",
-    ti_ext._atan_result_type,
-    ti_ext._atan,
+    ti._atan_result_type,
+    ti._atan,
     _ATAN_DOCSTRING,
     mkl_fn_to_call="_mkl_atan_to_call",
     mkl_impl_fn="_atan",
@@ -572,8 +570,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 atan2 = DPNPBinaryFunc(
     "atan2",
-    ti_ext._atan2_result_type,
-    ti_ext._atan2,
+    ti._atan2_result_type,
+    ti._atan2,
     _ATAN2_DOCSTRING,
     mkl_fn_to_call="_mkl_atan2_to_call",
     mkl_impl_fn="_atan2",
@@ -656,8 +654,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 atanh = DPNPUnaryFunc(
     "atanh",
-    ti_ext._atanh_result_type,
-    ti_ext._atanh,
+    ti._atanh_result_type,
+    ti._atanh,
     _ATANH_DOCSTRING,
     mkl_fn_to_call="_mkl_atanh_to_call",
     mkl_impl_fn="_atanh",
@@ -718,8 +716,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 cbrt = DPNPUnaryFunc(
     "cbrt",
-    ti_ext._cbrt_result_type,
-    ti_ext._cbrt,
+    ti._cbrt_result_type,
+    ti._cbrt,
     _CBRT_DOCSTRING,
     mkl_fn_to_call="_mkl_cbrt_to_call",
     mkl_impl_fn="_cbrt",
@@ -777,8 +775,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 cos = DPNPUnaryFunc(
     "cos",
-    ti_ext._cos_result_type,
-    ti_ext._cos,
+    ti._cos_result_type,
+    ti._cos,
     _COS_DOCSTRING,
     mkl_fn_to_call="_mkl_cos_to_call",
     mkl_impl_fn="_cos",
@@ -841,8 +839,8 @@ def _get_accumulation_res_dt(a, dtype):
 
 cosh = DPNPUnaryFunc(
     "cosh",
-    ti_ext._cosh_result_type,
-    ti_ext._cosh,
+    ti._cosh_result_type,
+    ti._cosh,
     _COSH_DOCSTRING,
     mkl_fn_to_call="_mkl_cosh_to_call",
     mkl_impl_fn="_cosh",
@@ -1127,8 +1125,8 @@ def cumlogsumexp(
 
 exp = DPNPUnaryFunc(
     "exp",
-    ti_ext._exp_result_type,
-    ti_ext._exp,
+    ti._exp_result_type,
+    ti._exp,
     _EXP_DOCSTRING,
     mkl_fn_to_call="_mkl_exp_to_call",
     mkl_impl_fn="_exp",
@@ -1187,8 +1185,8 @@ def cumlogsumexp(
 
 exp2 = DPNPUnaryFunc(
     "exp2",
-    ti_ext._exp2_result_type,
-    ti_ext._exp2,
+    ti._exp2_result_type,
+    ti._exp2,
     _EXP2_DOCSTRING,
     mkl_fn_to_call="_mkl_exp2_to_call",
     mkl_impl_fn="_exp2",
@@ -1259,8 +1257,8 @@ def cumlogsumexp(
 
 expm1 = DPNPUnaryFunc(
     "expm1",
-    ti_ext._expm1_result_type,
-    ti_ext._expm1,
+    ti._expm1_result_type,
+    ti._expm1,
     _EXPM1_DOCSTRING,
     mkl_fn_to_call="_mkl_expm1_to_call",
     mkl_impl_fn="_expm1",
@@ -1416,8 +1414,8 @@ def cumlogsumexp(
 
 log = DPNPUnaryFunc(
     "log",
-    ti_ext._log_result_type,
-    ti_ext._log,
+    ti._log_result_type,
+    ti._log,
     _LOG_DOCSTRING,
     mkl_fn_to_call="_mkl_ln_to_call",
     mkl_impl_fn="_ln",
@@ -1495,8 +1493,8 @@ def cumlogsumexp(
 
 log10 = DPNPUnaryFunc(
     "log10",
-    ti_ext._log10_result_type,
-    ti_ext._log10,
+    ti._log10_result_type,
+    ti._log10,
     _LOG10_DOCSTRING,
     mkl_fn_to_call="_mkl_log10_to_call",
     mkl_impl_fn="_log10",
@@ -1580,8 +1578,8 @@ def cumlogsumexp(
 
 log1p = DPNPUnaryFunc(
     "log1p",
-    ti_ext._log1p_result_type,
-    ti_ext._log1p,
+    ti._log1p_result_type,
+    ti._log1p,
     _LOG1P_DOCSTRING,
     mkl_fn_to_call="_mkl_log1p_to_call",
     mkl_impl_fn="_log1p",
@@ -1660,8 +1658,8 @@ def cumlogsumexp(
 
 log2 = DPNPUnaryFunc(
     "log2",
-    ti_ext._log2_result_type,
-    ti_ext._log2,
+    ti._log2_result_type,
+    ti._log2,
     _LOG2_DOCSTRING,
     mkl_fn_to_call="_mkl_log2_to_call",
     mkl_impl_fn="_log2",
@@ -2107,8 +2105,8 @@ def logsumexp(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 reciprocal = DPNPUnaryFunc(
     "reciprocal",
-    ti_ext._reciprocal_result_type,
-    ti_ext._reciprocal,
+    ti._reciprocal_result_type,
+    ti._reciprocal,
     _RECIPROCAL_DOCSTRING,
     mkl_fn_to_call="_mkl_inv_to_call",
     mkl_impl_fn="_inv",
@@ -2252,8 +2250,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 rsqrt = DPNPUnaryFunc(
     "rsqrt",
-    ti_ext._rsqrt_result_type,
-    ti_ext._rsqrt,
+    ti._rsqrt_result_type,
+    ti._rsqrt,
     _RSQRT_DOCSTRING,
 )
 
@@ -2309,8 +2307,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 sin = DPNPUnaryFunc(
     "sin",
-    ti_ext._sin_result_type,
-    ti_ext._sin,
+    ti._sin_result_type,
+    ti._sin,
     _SIN_DOCSTRING,
     mkl_fn_to_call="_mkl_sin_to_call",
     mkl_impl_fn="_sin",
@@ -2372,8 +2370,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 sinh = DPNPUnaryFunc(
     "sinh",
-    ti_ext._sinh_result_type,
-    ti_ext._sinh,
+    ti._sinh_result_type,
+    ti._sinh,
     _SINH_DOCSTRING,
     mkl_fn_to_call="_mkl_sinh_to_call",
     mkl_impl_fn="_sinh",
@@ -2449,8 +2447,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 sqrt = DPNPUnaryFunc(
     "sqrt",
-    ti_ext._sqrt_result_type,
-    ti_ext._sqrt,
+    ti._sqrt_result_type,
+    ti._sqrt,
     _SQRT_DOCSTRING,
     mkl_fn_to_call="_mkl_sqrt_to_call",
     mkl_impl_fn="_sqrt",
@@ -2508,8 +2506,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 square = DPNPUnaryFunc(
     "square",
-    ti_ext._square_result_type,
-    ti_ext._square,
+    ti._square_result_type,
+    ti._square,
     _SQUARE_DOCSTRING,
     mkl_fn_to_call="_mkl_sqr_to_call",
     mkl_impl_fn="_sqr",
@@ -2567,8 +2565,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 tan = DPNPUnaryFunc(
     "tan",
-    ti_ext._tan_result_type,
-    ti_ext._tan,
+    ti._tan_result_type,
+    ti._tan,
     _TAN_DOCSTRING,
     mkl_fn_to_call="_mkl_tan_to_call",
     mkl_impl_fn="_tan",
@@ -2632,8 +2630,8 @@ def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
 
 tanh = DPNPUnaryFunc(
     "tanh",
-    ti_ext._tanh_result_type,
-    ti_ext._tanh,
+    ti._tanh_result_type,
+    ti._tanh,
     _TANH_DOCSTRING,
     mkl_fn_to_call="_mkl_tanh_to_call",
     mkl_impl_fn="_tanh",

From aa816fdab570edd8eac1fd37adc477b442ee99a9 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 19 Mar 2026 12:24:13 +0100
Subject: [PATCH 19/43] Extend `_tensor_elementwise_impl` with binary functions
 part 3 (#2805)

This PR extends _tensor_elementwise_impl with the remaining binary
functions : ` copysign, logical_and, logical_or, logical_xor, maximum, minimum, multiply, nextafter, not_equal, pow,  remainder, subtract `

This is the last PR series of `_tensor_elementwise_impl` migration which
fully migrates all elementwise functions to `dpctl_ext.tensor`
---
 dpctl_ext/tensor/CMakeLists.txt               |  24 +-
 dpctl_ext/tensor/__init__.py                  |  24 +
 dpctl_ext/tensor/_elementwise_funcs.py        | 409 +++++++++++
 .../elementwise_functions/copysign.hpp        | 248 +++++++
 .../elementwise_functions/logical_and.hpp     | 291 ++++++++
 .../elementwise_functions/logical_or.hpp      | 290 ++++++++
 .../elementwise_functions/logical_xor.hpp     | 292 ++++++++
 .../kernels/elementwise_functions/maximum.hpp |   3 +
 .../kernels/elementwise_functions/minimum.hpp |   3 +
 .../elementwise_functions/multiply.hpp        | 648 ++++++++++++++++++
 .../elementwise_functions/nextafter.hpp       | 248 +++++++
 .../elementwise_functions/not_equal.hpp       | 304 ++++++++
 .../kernels/elementwise_functions/pow.hpp     | 602 ++++++++++++++++
 .../elementwise_functions/remainder.hpp       | 578 ++++++++++++++++
 .../elementwise_functions/subtract.hpp        | 646 +++++++++++++++++
 .../elementwise_functions/true_divide.hpp     |   2 +-
 .../source/elementwise_functions/copysign.cpp | 146 ++++
 .../source/elementwise_functions/copysign.hpp |  46 ++
 .../elementwise_common.cpp                    |  47 +-
 .../elementwise_functions/logical_and.cpp     | 146 ++++
 .../elementwise_functions/logical_and.hpp     |  46 ++
 .../elementwise_functions/logical_or.cpp      | 146 ++++
 .../elementwise_functions/logical_or.hpp      |  46 ++
 .../elementwise_functions/logical_xor.cpp     | 146 ++++
 .../elementwise_functions/logical_xor.hpp     |  46 ++
 .../source/elementwise_functions/maximum.cpp  | 146 ++++
 .../source/elementwise_functions/maximum.hpp  |  46 ++
 .../source/elementwise_functions/minimum.cpp  | 146 ++++
 .../source/elementwise_functions/minimum.hpp  |  46 ++
 .../source/elementwise_functions/multiply.cpp | 244 +++++++
 .../source/elementwise_functions/multiply.hpp |  46 ++
 .../elementwise_functions/nextafter.cpp       | 146 ++++
 .../elementwise_functions/nextafter.hpp       |  46 ++
 .../elementwise_functions/not_equal.cpp       | 146 ++++
 .../elementwise_functions/not_equal.hpp       |  46 ++
 .../source/elementwise_functions/pow.cpp      | 203 ++++++
 .../source/elementwise_functions/pow.hpp      |  46 ++
 .../elementwise_functions/remainder.cpp       | 205 ++++++
 .../elementwise_functions/remainder.hpp       |  46 ++
 .../source/elementwise_functions/subtract.cpp | 243 +++++++
 .../source/elementwise_functions/subtract.hpp |  42 ++
 dpnp/dpnp_iface_logic.py                      |  40 +-
 dpnp/dpnp_iface_mathematical.py               |  75 +-
 43 files changed, 7290 insertions(+), 95 deletions(-)
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.hpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.cpp
 create mode 100644 dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.hpp

diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index 7e1170f4ebff..6f286a8d7198 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -91,7 +91,7 @@ set(_elementwise_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp
@@ -114,22 +114,22 @@ set(_elementwise_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/negative.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/nextafter.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/nextafter.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/reciprocal.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/round.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/rsqrt.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sign.cpp
@@ -138,7 +138,7 @@ set(_elementwise_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sinh.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/square.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index 279e3a95fd03..71ef714c642a 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -73,6 +73,7 @@
     cbrt,
     ceil,
     conj,
+    copysign,
     cos,
     cosh,
     divide,
@@ -96,12 +97,22 @@
     log2,
     log10,
     logaddexp,
+    logical_and,
     logical_not,
+    logical_or,
+    logical_xor,
+    maximum,
+    minimum,
+    multiply,
     negative,
+    nextafter,
+    not_equal,
     positive,
+    pow,
     proj,
     real,
     reciprocal,
+    remainder,
     round,
     rsqrt,
     sign,
@@ -110,6 +121,7 @@
     sinh,
     sqrt,
     square,
+    subtract,
     tan,
     tanh,
     trunc,
@@ -204,6 +216,7 @@
     "concat",
     "conj",
     "copy",
+    "copysign",
     "cos",
     "cosh",
     "count_nonzero",
@@ -244,24 +257,33 @@
     "linspace",
     "log",
     "logaddexp",
+    "logical_and",
     "logical_not",
+    "logical_or",
+    "logical_xor",
     "logsumexp",
     "log1p",
     "log2",
     "log10",
     "max",
+    "maximum",
     "meshgrid",
     "min",
+    "minimum",
     "moveaxis",
+    "multiply",
     "permute_dims",
     "matmul",
     "matrix_transpose",
     "negative",
+    "nextafter",
     "nonzero",
+    "not_equal",
     "ones",
     "ones_like",
     "place",
     "positive",
+    "pow",
     "prod",
     "proj",
     "put",
@@ -269,6 +291,7 @@
     "real",
     "reciprocal",
     "reduce_hypot",
+    "remainder",
     "repeat",
     "reshape",
     "result_type",
@@ -285,6 +308,7 @@
     "square",
     "squeeze",
     "stack",
+    "subtract",
     "sum",
     "swapaxes",
     "take",
diff --git a/dpctl_ext/tensor/_elementwise_funcs.py b/dpctl_ext/tensor/_elementwise_funcs.py
index 17bdf94d9be5..6442ef0b4594 100644
--- a/dpctl_ext/tensor/_elementwise_funcs.py
+++ b/dpctl_ext/tensor/_elementwise_funcs.py
@@ -35,6 +35,7 @@
     _acceptance_fn_divide,
     _acceptance_fn_negative,
     _acceptance_fn_reciprocal,
+    _acceptance_fn_subtract,
     _resolve_weak_types_all_py_ints,
 )
 
@@ -1243,6 +1244,102 @@
 )
 del _logaddexp_docstring_
 
+# B16: ==== LOGICAL_AND (x1, x2)
+_logical_and_docstring_ = r"""
+logical_and(x1, x2, /, \*, out=None, order='K')
+
+Computes the logical AND for each element `x1_i` of the input array `x1` with
+the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise logical AND results.
+"""
+logical_and = BinaryElementwiseFunc(
+    "logical_and",
+    ti._logical_and_result_type,
+    ti._logical_and,
+    _logical_and_docstring_,
+)
+del _logical_and_docstring_
+
+# B17: ==== LOGICAL_OR  (x1, x2)
+_logical_or_docstring_ = r"""
+logical_or(x1, x2, /, \*, out=None, order='K')
+
+Computes the logical OR for each element `x1_i` of the input array `x1`
+with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise logical OR results.
+"""
+logical_or = BinaryElementwiseFunc(
+    "logical_or",
+    ti._logical_or_result_type,
+    ti._logical_or,
+    _logical_or_docstring_,
+)
+del _logical_or_docstring_
+
+# B18: ==== LOGICAL_XOR (x1, x2)
+_logical_xor_docstring_ = r"""
+logical_xor(x1, x2, /, \*, out=None, order='K')
+
+Computes the logical XOR for each element `x1_i` of the input array `x1`
+with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise logical XOR results.
+"""
+logical_xor = BinaryElementwiseFunc(
+    "logical_xor",
+    ti._logical_xor_result_type,
+    ti._logical_xor,
+    _logical_xor_docstring_,
+)
+del _logical_xor_docstring_
+
 # U24: ==== LOGICAL_NOT (x)
 _logical_not_docstring = r"""
 logical_not(x, /, \*, out=None, order='K')
@@ -1272,6 +1369,106 @@
 )
 del _logical_not_docstring
 
+# B26: ==== MAXIMUM    (x1, x2)
+_maximum_docstring_ = r"""
+maximum(x1, x2, /, \*, out=None, order='K')
+
+Compares two input arrays `x1` and `x2` and returns a new array containing the
+element-wise maxima.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise maxima. The data type of
+        the returned array is determined by the Type Promotion Rules.
+"""
+maximum = BinaryElementwiseFunc(
+    "maximum",
+    ti._maximum_result_type,
+    ti._maximum,
+    _maximum_docstring_,
+)
+del _maximum_docstring_
+
+# B27: ==== MINIMUM    (x1, x2)
+_minimum_docstring_ = r"""
+minimum(x1, x2, /, \*, out=None, order='K')
+
+Compares two input arrays `x1` and `x2` and returns a new array containing the
+element-wise minima.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise minima. The data type of
+        the returned array is determined by the Type Promotion Rules.
+"""
+minimum = BinaryElementwiseFunc(
+    "minimum",
+    ti._minimum_result_type,
+    ti._minimum,
+    _minimum_docstring_,
+)
+del _minimum_docstring_
+
+# B19: ==== MULTIPLY    (x1, x2)
+_multiply_docstring_ = r"""
+multiply(x1, x2, /, \*, out=None, order='K')
+
+Calculates the product for each element `x1_i` of the input array `x1` with the
+respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise products. The data type of
+        the returned array is determined by the Type Promotion Rules.
+"""
+multiply = BinaryElementwiseFunc(
+    "multiply",
+    ti._multiply_result_type,
+    ti._multiply,
+    _multiply_docstring_,
+    binary_inplace_fn=ti._multiply_inplace,
+)
+del _multiply_docstring_
+
 # U25: ==== NEGATIVE    (x)
 _negative_docstring_ = r"""
 negative(x, /, \*, out=None, order='K')
@@ -1302,6 +1499,77 @@
 )
 del _negative_docstring_
 
+# B28: ==== NEXTAFTER    (x1, x2)
+_nextafter_docstring_ = r"""
+nextafter(x1, x2, /, \*, out=None, order='K')
+
+Calculates the next floating-point value after element `x1_i` of the input
+array `x1` toward the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued floating-point data
+        type.
+    x2 (usm_ndarray):
+        Second input array, expected to have a real-valued floating-point data
+        type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise next representable values of `x1`
+        in the direction of `x2`. The data type of the returned array is
+        determined by the Type Promotion Rules.
+"""
+nextafter = BinaryElementwiseFunc(
+    "nextafter",
+    ti._nextafter_result_type,
+    ti._nextafter,
+    _nextafter_docstring_,
+)
+del _nextafter_docstring_
+
+# B20: ==== NOT_EQUAL   (x1, x2)
+_not_equal_docstring_ = r"""
+not_equal(x1, x2, /, \*, out=None, order='K')
+
+Calculates inequality test results for each element `x1_i` of the
+input array `x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array.
+    x2 (usm_ndarray):
+        Second input array.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise inequality comparison.
+        The returned array has a data type of `bool`.
+"""
+
+not_equal = BinaryElementwiseFunc(
+    "not_equal",
+    ti._not_equal_result_type,
+    ti._not_equal,
+    _not_equal_docstring_,
+    weak_type_resolver=_resolve_weak_types_all_py_ints,
+)
+del _not_equal_docstring_
+
 # U26: ==== POSITIVE    (x)
 _positive_docstring_ = r"""
 positive(x, /, \*, out=None, order='K')
@@ -1328,6 +1596,40 @@
 )
 del _positive_docstring_
 
+# B21: ==== POW         (x1, x2)
+_pow_docstring_ = r"""
+pow(x1, x2, /, \*, out=None, order='K')
+
+Calculates `x1_i` raised to `x2_i` for each element `x1_i` of the input array
+`x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a numeric data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a numeric data type.
+    out (usm_ndarray):
+        Output array to populate. Array must have the correct
+        shape and the expected data type.
+    order ("C","F","A","K", optional): memory layout of the new
+        output array, if parameter `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the bases in `x1` raised to the exponents in `x2`
+        element-wise. The data type of the returned array is determined by the
+        Type Promotion Rules.
+"""
+pow = BinaryElementwiseFunc(
+    "pow",
+    ti._pow_result_type,
+    ti._pow,
+    _pow_docstring_,
+    binary_inplace_fn=ti._pow_inplace,
+)
+del _pow_docstring_
+
 # U27: ==== REAL        (x)
 _real_docstring = r"""
 real(x, /, \*, out=None, order='K')
@@ -1359,6 +1661,43 @@
 )
 del _real_docstring
 
+# B22: ==== REMAINDER   (x1, x2)
+_remainder_docstring_ = r"""
+remainder(x1, x2, /, \*, out=None, order='K')
+
+Calculates the remainder of division for each element `x1_i` of the input array
+`x1` with the respective element `x2_i` of the input array `x2`.
+
+This function is equivalent to the Python modulus operator.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real-valued data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise remainders. Each remainder has the
+        same sign as respective element `x2_i`. The data type of the returned
+        array is determined by the Type Promotion Rules.
+"""
+remainder = BinaryElementwiseFunc(
+    "remainder",
+    ti._remainder_result_type,
+    ti._remainder,
+    _remainder_docstring_,
+    binary_inplace_fn=ti._remainder_inplace,
+)
+del _remainder_docstring_
+
 # U28: ==== ROUND       (x)
 _round_docstring = r"""
 round(x, /, \*, out=None, order='K')
@@ -1534,6 +1873,41 @@
 )
 del _sqrt_docstring_
 
+# B23: ==== SUBTRACT    (x1, x2)
+_subtract_docstring_ = r"""
+subtract(x1, x2, /, \*, out=None, order='K')
+
+Calculates the difference between each element `x1_i` of the input
+array `x1` and the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a numeric data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a numeric data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise differences. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+subtract = BinaryElementwiseFunc(
+    "subtract",
+    ti._subtract_result_type,
+    ti._subtract,
+    _subtract_docstring_,
+    binary_inplace_fn=ti._subtract_inplace,
+    acceptance_fn=_acceptance_fn_subtract,
+)
+del _subtract_docstring_
+
 # U34: ==== TAN         (x)
 _tan_docstring = r"""
 tan(x, /, \*, out=None, order='K')
@@ -1710,6 +2084,41 @@
 )
 del _exp2_docstring_
 
+# B25: ==== COPYSIGN    (x1, x2)
+_copysign_docstring_ = r"""
+copysign(x1, x2, /, \*, out=None, order='K')
+
+Composes a floating-point value with the magnitude of `x1_i` and the sign of
+`x2_i` for each element of input arrays `x1` and `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued floating-point data
+        type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real-valued floating-point
+        data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+copysign = BinaryElementwiseFunc(
+    "copysign",
+    ti._copysign_result_type,
+    ti._copysign,
+    _copysign_docstring_,
+)
+del _copysign_docstring_
+
 # U39: ==== RSQRT        (x)
 _rsqrt_docstring_ = r"""
 rsqrt(x, /, \*, out=None, order='K')
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
new file mode 100644
index 000000000000..c2eb0f7e850e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
@@ -0,0 +1,248 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of COPYSIGN(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::copysign
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct CopysignFunctor
+{
+
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        return sycl::copysign(in1, in2);
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = sycl::copysign(in1, in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using CopysignContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            CopysignFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using CopysignStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    CopysignFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct CopysignOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct CopysignContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class copysign_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event copysign_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg1_p,
+                                 ssize_t arg1_offset,
+                                 const char *arg2_p,
+                                 ssize_t arg2_offset,
+                                 char *res_p,
+                                 ssize_t res_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using CopySignHS =
+        hyperparam_detail::CopysignContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = CopySignHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = CopySignHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, CopysignOutputType, CopysignContigFunctor,
+        copysign_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct CopysignContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!CopysignOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = copysign_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct CopysignTypeMapFactory
+{
+    /*! @brief get typeid for output type of divide(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CopysignOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class copysign_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    copysign_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg1_p,
+                          ssize_t arg1_offset,
+                          const char *arg2_p,
+                          ssize_t arg2_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, CopysignOutputType, CopysignStridedFunctor,
+        copysign_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct CopysignStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!CopysignOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = copysign_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::copysign
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
new file mode 100644
index 000000000000..39049dab8d5e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
@@ -0,0 +1,291 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of LOGICAL_AND(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::logical_and
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct LogicalAndFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        using tu_ns::convert_impl;
+
+        return (convert_impl<bool, argT1>(in1) &&
+                convert_impl<bool, argT2>(in2));
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+
+        auto tmp = (in1 && in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using LogicalAndContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    LogicalAndFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using LogicalAndStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    LogicalAndFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct LogicalAndOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct LogicalAndContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class logical_and_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    logical_and_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using LogicalAndHS =
+        hyperparam_detail::LogicalAndContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = LogicalAndHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LogicalAndHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, LogicalAndOutputType, LogicalAndContigFunctor,
+        logical_and_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalAndContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogicalAndOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logical_and_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalAndTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename LogicalAndOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class logical_and_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    logical_and_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg1_p,
+                             ssize_t arg1_offset,
+                             const char *arg2_p,
+                             ssize_t arg2_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, LogicalAndOutputType, LogicalAndStridedFunctor,
+        logical_and_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalAndStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogicalAndOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logical_and_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::logical_and
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
new file mode 100644
index 000000000000..637e7681e7c0
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
@@ -0,0 +1,290 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of LOGICAL_OR(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::logical_or
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct LogicalOrFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        using tu_ns::convert_impl;
+
+        return (convert_impl<bool, argT1>(in1) ||
+                convert_impl<bool, argT2>(in2));
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+
+        auto tmp = (in1 || in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using LogicalOrContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    LogicalOrFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using LogicalOrStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    LogicalOrFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct LogicalOrOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct LogicalOrContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class logical_or_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event logical_or_contig_impl(sycl::queue &exec_q,
+                                   std::size_t nelems,
+                                   const char *arg1_p,
+                                   ssize_t arg1_offset,
+                                   const char *arg2_p,
+                                   ssize_t arg2_offset,
+                                   char *res_p,
+                                   ssize_t res_offset,
+                                   const std::vector<sycl::event> &depends = {})
+{
+    using LogicalOrHS =
+        hyperparam_detail::LogicalOrContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = LogicalOrHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LogicalOrHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, LogicalOrOutputType, LogicalOrContigFunctor,
+        logical_or_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalOrContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogicalOrOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logical_or_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalOrTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename LogicalOrOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class logical_or_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    logical_or_strided_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            int nd,
+                            const ssize_t *shape_and_strides,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends,
+                            const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, LogicalOrOutputType, LogicalOrStridedFunctor,
+        logical_or_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalOrStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogicalOrOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logical_or_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::logical_or
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
new file mode 100644
index 000000000000..698e4d9ab5c1
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
@@ -0,0 +1,292 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of LOGICAL_XOR(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::logical_xor
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct LogicalXorFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        using tu_ns::convert_impl;
+
+        return (convert_impl<bool, argT1>(in1) !=
+                convert_impl<bool, argT2>(in2));
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        using tu_ns::vec_cast;
+        auto tmp1 = vec_cast<bool, argT1, vec_sz>(in1);
+        auto tmp2 = vec_cast<bool, argT2, vec_sz>(in2);
+
+        auto tmp = (tmp1 != tmp2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using LogicalXorContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    LogicalXorFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using LogicalXorStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    LogicalXorFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct LogicalXorOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct LogicalXorContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class logical_xor_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    logical_xor_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using LogicalXorHS =
+        hyperparam_detail::LogicalXorContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = LogicalXorHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LogicalXorHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, LogicalXorOutputType, LogicalXorContigFunctor,
+        logical_xor_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalXorContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogicalXorOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logical_xor_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalXorTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename LogicalXorOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class logical_xor_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    logical_xor_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg1_p,
+                             ssize_t arg1_offset,
+                             const char *arg2_p,
+                             ssize_t arg2_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, LogicalXorOutputType, LogicalXorStridedFunctor,
+        logical_xor_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalXorStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogicalXorOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logical_xor_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::logical_xor
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index f204b6640042..af6f95863e65 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -25,6 +25,8 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 // THE POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
 ///
 /// \file
 /// This file defines kernels for elementwise evaluation of MAXIMUM(x1, x2)
@@ -52,6 +54,7 @@
 
 namespace dpctl::tensor::kernels::maximum
 {
+
 using dpctl::tensor::ssize_t;
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index cb7d86377984..0a95987449a1 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -25,6 +25,8 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 // THE POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
 ///
 /// \file
 /// This file defines kernels for elementwise evaluation of MINIMUM(x1, x2)
@@ -52,6 +54,7 @@
 
 namespace dpctl::tensor::kernels::minimum
 {
+
 using dpctl::tensor::ssize_t;
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
new file mode 100644
index 000000000000..587a05106ead
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
@@ -0,0 +1,648 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of MUL(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::kernels::multiply
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct MultiplyFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (tu_ns::is_complex<argT1>::value &&
+                      tu_ns::is_complex<argT2>::value)
+        {
+            using realT1 = typename argT1::value_type;
+            using realT2 = typename argT2::value_type;
+
+            return exprm_ns::complex<realT1>(in1) *
+                   exprm_ns::complex<realT2>(in2);
+        }
+        else {
+            return in1 * in2;
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = in1 * in2;
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using MultiplyContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            MultiplyFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using MultiplyStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    MultiplyFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct MultiplyOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct MultiplyContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class multiply_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event multiply_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg1_p,
+                                 ssize_t arg1_offset,
+                                 const char *arg2_p,
+                                 ssize_t arg2_offset,
+                                 char *res_p,
+                                 ssize_t res_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using MulHS =
+        hyperparam_detail::MultiplyContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = MulHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = MulHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, MultiplyOutputType, MultiplyContigFunctor,
+        multiply_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = multiply_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyTypeMapFactory
+{
+    /*! @brief get typeid for output type of multiply(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename MultiplyOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class multiply_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    multiply_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg1_p,
+                          ssize_t arg1_offset,
+                          const char *arg2_p,
+                          ssize_t arg2_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, MultiplyOutputType, MultiplyStridedFunctor,
+        multiply_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = multiply_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+class multiply_matrix_row_broadcast_sg_krn;
+
+template <typename argT1, typename argT2, typename resT>
+using MultiplyContigMatrixContigRowBroadcastingFunctor =
+    elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        MultiplyFunctor<argT1, argT2, resT>>;
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event multiply_contig_matrix_contig_row_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = mat[i,j] * vec[j]
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
+        argT1, argT2, resT, MultiplyContigMatrixContigRowBroadcastingFunctor,
+        multiply_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p,
+                                              mat_offset, vec_p, vec_offset,
+                                              res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyContigMatrixContigRowBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using resT = typename MultiplyOutputType<T1, T2>::value_type;
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value ||
+                          dpctl::tensor::type_utils::is_complex<resT>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn =
+                    multiply_contig_matrix_contig_row_broadcast_impl<T1, T2,
+                                                                     resT>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event multiply_contig_row_contig_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = mat[i,j] * vec[j]
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return multiply_contig_matrix_contig_row_broadcast_impl<argT2, argT1, resT>(
+        exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p,
+        res_offset, depends);
+};
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyContigRowContigMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using resT = typename MultiplyOutputType<T1, T2>::value_type;
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value ||
+                          dpctl::tensor::type_utils::is_complex<resT>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn =
+                    multiply_contig_row_contig_matrix_broadcast_impl<T1, T2,
+                                                                     resT>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct MultiplyInplaceFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+
+    void operator()(resT &res, const argT &in)
+    {
+        res *= in;
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in)
+    {
+        res *= in;
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using MultiplyInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        MultiplyInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using MultiplyInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        MultiplyInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class multiply_inplace_contig_kernel;
+
+/* @brief Types supported by in-place multiplication */
+template <typename argTy, typename resTy>
+struct MultiplyInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, resTy, bool>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    resTy,
+                                    std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    resTy,
+                                    std::complex<double>>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct MultiplyInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x *= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (MultiplyInplaceTypePairSupport<argT, resT>::is_defined) {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event
+    multiply_inplace_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg_p,
+                                 ssize_t arg_offset,
+                                 char *res_p,
+                                 ssize_t res_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using MulHS =
+        hyperparam_detail::MultiplyContigHyperparameterSet<resTy, argTy>;
+    static constexpr std::uint8_t vec_sz = MulHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = MulHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, MultiplyInplaceContigFunctor,
+        multiply_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = multiply_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class multiply_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event multiply_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, MultiplyInplaceStridedFunctor,
+        multiply_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides,
+                                         arg_p, arg_offset, res_p, res_offset,
+                                         depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = multiply_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+class multiply_inplace_row_matrix_broadcast_sg_krn;
+
+template <typename argT, typename resT>
+using MultiplyInplaceRowMatrixBroadcastingFunctor =
+    elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor<
+        argT,
+        resT,
+        MultiplyInplaceFunctor<argT, resT>>;
+
+template <typename argT, typename resT>
+sycl::event multiply_inplace_row_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_inplace_row_matrix_broadcast_impl<
+        argT, resT, MultiplyInplaceRowMatrixBroadcastingFunctor,
+        multiply_inplace_row_matrix_broadcast_sg_krn>(
+        exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset,
+        depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyInplaceRowMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn = multiply_inplace_row_matrix_broadcast_impl<T1, T2>;
+                return fn;
+            }
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::multiply
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
new file mode 100644
index 000000000000..a703892a7606
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
@@ -0,0 +1,248 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of NEXTAFTER(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::nextafter
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct NextafterFunctor
+{
+
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        return sycl::nextafter(in1, in2);
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto res = sycl::nextafter(in1, in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(res)::element_type>) {
+            return res;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(res)::element_type, vec_sz>(
+                res);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using NextafterContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    NextafterFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using NextafterStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    NextafterFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct NextafterOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct NextafterContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class nextafter_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event nextafter_contig_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  const char *arg1_p,
+                                  ssize_t arg1_offset,
+                                  const char *arg2_p,
+                                  ssize_t arg2_offset,
+                                  char *res_p,
+                                  ssize_t res_offset,
+                                  const std::vector<sycl::event> &depends = {})
+{
+    using NextafterHS =
+        hyperparam_detail::NextafterContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = NextafterHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = NextafterHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, NextafterOutputType, NextafterContigFunctor,
+        nextafter_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct NextafterContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!NextafterOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = nextafter_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct NextafterTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::nextafter(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename NextafterOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class nextafter_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    nextafter_strided_impl(sycl::queue &exec_q,
+                           std::size_t nelems,
+                           int nd,
+                           const ssize_t *shape_and_strides,
+                           const char *arg1_p,
+                           ssize_t arg1_offset,
+                           const char *arg2_p,
+                           ssize_t arg2_offset,
+                           char *res_p,
+                           ssize_t res_offset,
+                           const std::vector<sycl::event> &depends,
+                           const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, NextafterOutputType, NextafterStridedFunctor,
+        nextafter_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                  arg1_offset, arg2_p, arg2_offset, res_p,
+                                  res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct NextafterStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!NextafterOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = nextafter_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::nextafter
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
new file mode 100644
index 000000000000..224e3fbe5b77
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
@@ -0,0 +1,304 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of inequality of
+/// tensor elements.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::not_equal
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct NotEqualFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (std::is_integral_v<argT1> && std::is_integral_v<argT2> &&
+                      std::is_signed_v<argT1> != std::is_signed_v<argT2>)
+        {
+            if constexpr (std::is_signed_v<argT1> && !std::is_signed_v<argT2>) {
+                return (in1 < 0) ? true : (static_cast<argT2>(in1) != in2);
+            }
+            else {
+                if constexpr (!std::is_signed_v<argT1> &&
+                              std::is_signed_v<argT2>) {
+                    return (in2 < 0) ? true : (in1 != static_cast<argT1>(in2));
+                }
+            }
+        }
+        else {
+            return (in1 != in2);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = (in1 != in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using NotEqualContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            NotEqualFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using NotEqualStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    NotEqualFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct NotEqualOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct NotEqualContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class not_equal_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event not_equal_contig_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  const char *arg1_p,
+                                  ssize_t arg1_offset,
+                                  const char *arg2_p,
+                                  ssize_t arg2_offset,
+                                  char *res_p,
+                                  ssize_t res_offset,
+                                  const std::vector<sycl::event> &depends = {})
+{
+    using NotEqHS =
+        hyperparam_detail::NotEqualContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = NotEqHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = NotEqHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, NotEqualOutputType, NotEqualContigFunctor,
+        not_equal_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct NotEqualContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!NotEqualOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = not_equal_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct NotEqualTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()!=(x, y), always bool */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename NotEqualOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class not_equal_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    not_equal_strided_impl(sycl::queue &exec_q,
+                           std::size_t nelems,
+                           int nd,
+                           const ssize_t *shape_and_strides,
+                           const char *arg1_p,
+                           ssize_t arg1_offset,
+                           const char *arg2_p,
+                           ssize_t arg2_offset,
+                           char *res_p,
+                           ssize_t res_offset,
+                           const std::vector<sycl::event> &depends,
+                           const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, NotEqualOutputType, NotEqualStridedFunctor,
+        not_equal_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                  arg1_offset, arg2_p, arg2_offset, res_p,
+                                  res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct NotEqualStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!NotEqualOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = not_equal_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::not_equal
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
new file mode 100644
index 000000000000..46489f45985e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
@@ -0,0 +1,602 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of POW(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::kernels::pow
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct PowFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
+            auto tmp1 = in1;
+            auto tmp2 = in2;
+            if constexpr (std::is_signed_v<argT2>) {
+                if (tmp2 < 0) {
+                    // invalid; return 0
+                    return resT(0);
+                }
+            }
+            resT res = 1;
+            if (tmp1 == 1 || tmp2 == 0) {
+                return res;
+            }
+            while (tmp2 > 0) {
+                if (tmp2 & 1) {
+                    res *= tmp1;
+                }
+                tmp2 >>= 1;
+                tmp1 *= tmp1;
+            }
+            return res;
+        }
+        else if constexpr (tu_ns::is_complex<argT1>::value &&
+                           tu_ns::is_complex<argT2>::value)
+        {
+            using realT1 = typename argT1::value_type;
+            using realT2 = typename argT2::value_type;
+
+            return exprm_ns::pow(exprm_ns::complex<realT1>(in1),
+                                 exprm_ns::complex<realT2>(in2));
+        }
+        else {
+            return sycl::pow(in1, in2);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
+            sycl::vec<resT, vec_sz> res;
+#pragma unroll
+            for (int i = 0; i < vec_sz; ++i) {
+                auto tmp1 = in1[i];
+                auto tmp2 = in2[i];
+                if constexpr (std::is_signed_v<argT2>) {
+                    if (tmp2 < 0) {
+                        // invalid; yield 0
+                        res[i] = 0;
+                        continue;
+                    }
+                }
+                resT res_tmp = 1;
+                if (tmp1 == 1 || tmp2 == 0) {
+                    res[i] = res_tmp;
+                    continue;
+                }
+                while (tmp2 > 0) {
+                    if (tmp2 & 1) {
+                        res_tmp *= tmp1;
+                    }
+                    tmp2 >>= 1;
+                    tmp1 *= tmp1;
+                }
+                res[i] = res_tmp;
+            }
+            return res;
+        }
+        else {
+            auto res = sycl::pow(in1, in2);
+            if constexpr (std::is_same_v<resT,
+                                         typename decltype(res)::element_type>)
+            {
+                return res;
+            }
+            else {
+                using dpctl::tensor::type_utils::vec_cast;
+
+                return vec_cast<resT, typename decltype(res)::element_type,
+                                vec_sz>(res);
+            }
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using PowContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            PowFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using PowStridedFunctor =
+    elementwise_common::BinaryStridedFunctor<argT1,
+                                             argT2,
+                                             resT,
+                                             IndexerT,
+                                             PowFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct PowOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct PowContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class pow_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event pow_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using PowHS = hyperparam_detail::PowContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = PowHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = PowHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct PowContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!PowOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = pow_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct PowTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::pow(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename PowOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class pow_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event pow_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg1_p,
+                             ssize_t arg1_offset,
+                             const char *arg2_p,
+                             ssize_t arg2_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, PowOutputType, PowStridedFunctor, pow_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct PowStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!PowOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = pow_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct PowInplaceFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+
+    void operator()(resT &res, const argT &in)
+    {
+        if constexpr (std::is_integral_v<argT> || std::is_integral_v<resT>) {
+            auto tmp1 = res;
+            auto tmp2 = in;
+            if constexpr (std::is_signed_v<argT>) {
+                if (tmp2 < 0) {
+                    // invalid; return 0
+                    res = 0;
+                    return;
+                }
+            }
+            if (tmp1 == 1) {
+                return;
+            }
+            if (tmp2 == 0) {
+                res = 1;
+                return;
+            }
+            resT res_tmp = 1;
+            while (tmp2 > 0) {
+                if (tmp2 & 1) {
+                    res_tmp *= tmp1;
+                }
+                tmp2 >>= 1;
+                tmp1 *= tmp1;
+            }
+            res = res_tmp;
+        }
+        else if constexpr (tu_ns::is_complex<argT>::value &&
+                           tu_ns::is_complex<resT>::value)
+        {
+            using r_resT = typename resT::value_type;
+            using r_argT = typename argT::value_type;
+
+            res = exprm_ns::pow(exprm_ns::complex<r_resT>(res),
+                                exprm_ns::complex<r_argT>(in));
+        }
+        else {
+            res = sycl::pow(res, in);
+        }
+        return;
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in)
+    {
+        if constexpr (std::is_integral_v<argT> || std::is_integral_v<resT>) {
+#pragma unroll
+            for (int i = 0; i < vec_sz; ++i) {
+                auto tmp1 = res[i];
+                auto tmp2 = in[i];
+                if constexpr (std::is_signed_v<argT>) {
+                    if (tmp2 < 0) {
+                        // invalid; return 0
+                        res[i] = 0;
+                        continue;
+                    }
+                }
+                if (tmp1 == 1) {
+                    continue;
+                }
+                if (tmp2 == 0) {
+                    res[i] = 1;
+                    continue;
+                }
+                resT res_tmp = 1;
+                while (tmp2 > 0) {
+                    if (tmp2 & 1) {
+                        res_tmp *= tmp1;
+                    }
+                    tmp2 >>= 1;
+                    tmp1 *= tmp1;
+                }
+                res[i] = res_tmp;
+            }
+        }
+        else {
+            res = sycl::pow(res, in);
+        }
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
+    argT,
+    resT,
+    PowInplaceFunctor<argT, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using PowInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        PowInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class pow_inplace_contig_kernel;
+
+/* @brief Types supported by in-place pow */
+template <typename argTy, typename resTy>
+struct PowInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    resTy,
+                                    std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    resTy,
+                                    std::complex<double>>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct PowInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x **= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (PowInplaceTypePairSupport<argT, resT>::is_defined) {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event
+    pow_inplace_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg_p,
+                            ssize_t arg_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using PowHS = hyperparam_detail::PowContigHyperparameterSet<resTy, argTy>;
+    static constexpr std::uint8_t vec_sz = PowHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = PowHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset,
+                        depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct PowInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!PowInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = pow_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class pow_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event
+    pow_inplace_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             ssize_t arg_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, PowInplaceStridedFunctor, pow_inplace_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct PowInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!PowInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = pow_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::pow
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
new file mode 100644
index 000000000000..ca87d0f41605
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
@@ -0,0 +1,578 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of REMAINDER(x1, x2)
+/// function that computes the Python modulus operator, which is specifically
+/// designed as the complement to floor_divide(x1, x2).
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+#include <vector>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::kernels::remainder
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct RemainderFunctor
+{
+    static_assert(std::is_same_v<argT1, argT2>);
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
+            if (in2 == argT2(0)) {
+                return resT(0);
+            }
+            if constexpr (std::is_signed_v<argT1> || std::is_signed_v<argT2>) {
+                auto out = (in1 % in2);
+                if (out != 0 && l_xor(in1 < 0, in2 < 0)) {
+                    out += in2;
+                }
+                return out;
+            }
+            else {
+                return (in1 % in2);
+            }
+        }
+        else {
+            auto rem = sycl::fmod(in1, in2);
+            if (rem) {
+                if (l_xor(in2 < 0, rem < 0)) {
+                    rem += in2;
+                }
+            }
+            else {
+                rem = sycl::copysign(resT(0), in2);
+            }
+            return rem;
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
+            sycl::vec<resT, vec_sz> rem;
+#pragma unroll
+            for (auto i = 0; i < vec_sz; ++i) {
+                if (in2[i] == argT2(0)) {
+                    rem[i] = resT(0);
+                }
+                else {
+                    rem[i] = in1[i] % in2[i];
+                    if constexpr (std::is_signed_v<argT1> ||
+                                  std::is_signed_v<argT2>) {
+                        if (rem[i] != 0 && l_xor(in1[i] < 0, in2[i] < 0)) {
+                            rem[i] += in2[i];
+                        }
+                    }
+                }
+            }
+            return rem;
+        }
+        else {
+            auto rem = sycl::fmod(in1, in2);
+            using remT = typename decltype(rem)::element_type;
+#pragma unroll
+            for (auto i = 0; i < vec_sz; ++i) {
+                if (rem[i]) {
+                    if (l_xor(in2[i] < 0, rem[i] < 0)) {
+                        rem[i] += in2[i];
+                    }
+                }
+                else {
+                    rem[i] = sycl::copysign(remT(0), in2[i]);
+                }
+            }
+            if constexpr (std::is_same_v<resT, remT>) {
+                return rem;
+            }
+            else {
+                using dpctl::tensor::type_utils::vec_cast;
+
+                return vec_cast<resT, remT, vec_sz>(rem);
+            }
+        }
+    }
+
+private:
+    bool l_xor(bool b1, bool b2) const
+    {
+        return (b1 != b2);
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using RemainderContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    RemainderFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using RemainderStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    RemainderFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct RemainderOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct RemainderContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class remainder_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event remainder_contig_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  const char *arg1_p,
+                                  ssize_t arg1_offset,
+                                  const char *arg2_p,
+                                  ssize_t arg2_offset,
+                                  char *res_p,
+                                  ssize_t res_offset,
+                                  const std::vector<sycl::event> &depends = {})
+{
+    using RemHS =
+        hyperparam_detail::RemainderContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = RemHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = RemHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, RemainderOutputType, RemainderContigFunctor,
+        remainder_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct RemainderContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!RemainderOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = remainder_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct RemainderTypeMapFactory
+{
+    /*! @brief get typeid for output type of remainder(T x, T y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename RemainderOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class remainder_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    remainder_strided_impl(sycl::queue &exec_q,
+                           std::size_t nelems,
+                           int nd,
+                           const ssize_t *shape_and_strides,
+                           const char *arg1_p,
+                           ssize_t arg1_offset,
+                           const char *arg2_p,
+                           ssize_t arg2_offset,
+                           char *res_p,
+                           ssize_t res_offset,
+                           const std::vector<sycl::event> &depends,
+                           const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, RemainderOutputType, RemainderStridedFunctor,
+        remainder_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                  arg1_offset, arg2_p, arg2_offset, res_p,
+                                  res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct RemainderStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!RemainderOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = remainder_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct RemainderInplaceFunctor
+{
+
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    // functor is only well-defined when argT and resT are the same
+    static_assert(std::is_same_v<argT, resT>);
+
+    void operator()(resT &res, const argT &in)
+    {
+        if constexpr (std::is_integral_v<argT> || std::is_integral_v<resT>) {
+            if (in == argT(0)) {
+                res = 0;
+                return;
+            }
+            if constexpr (std::is_signed_v<argT> || std::is_signed_v<resT>) {
+                auto tmp = res;
+                res %= in;
+                if (res != resT(0) && l_xor(tmp < 0, in < 0)) {
+                    res += in;
+                }
+            }
+            else {
+                res %= in;
+            }
+        }
+        else {
+            res = sycl::fmod(res, in);
+            if (res) {
+                if (l_xor(in < 0, res < 0)) {
+                    res += in;
+                }
+            }
+            else {
+                res = sycl::copysign(resT(0), in);
+            }
+        }
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in)
+    {
+        if constexpr (std::is_integral_v<argT> || std::is_integral_v<resT>) {
+#pragma unroll
+            for (auto i = 0; i < vec_sz; ++i) {
+                if (in[i] == argT(0)) {
+                    res[i] = 0;
+                }
+                else {
+                    auto rem = res[i] % in[i];
+                    if constexpr (std::is_signed_v<argT> ||
+                                  std::is_signed_v<resT>) {
+                        if (rem != 0 && l_xor(res[i] < 0, in[i] < 0)) {
+                            rem += in[i];
+                        }
+                    }
+                    res[i] = rem;
+                }
+            }
+        }
+        else {
+            res = sycl::fmod(res, in);
+#pragma unroll
+            for (auto i = 0; i < vec_sz; ++i) {
+                if (res[i]) {
+                    if (l_xor(in[i] < 0, res[i] < 0)) {
+                        res[i] += in[i];
+                    }
+                }
+                else {
+                    res[i] = sycl::copysign(resT(0), in[i]);
+                }
+            }
+        }
+    }
+
+private:
+    bool l_xor(bool b1, bool b2) const
+    {
+        return (b1 != b2);
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using RemainderInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        RemainderInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using RemainderInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        RemainderInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class remainder_inplace_contig_kernel;
+
+/* @brief Types supported by in-place remainder */
+template <typename argTy, typename resTy>
+struct RemainderInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct RemainderInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x %= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (RemainderInplaceTypePairSupport<argT, resT>::is_defined) {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event
+    remainder_inplace_contig_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  const char *arg_p,
+                                  ssize_t arg_offset,
+                                  char *res_p,
+                                  ssize_t res_offset,
+                                  const std::vector<sycl::event> &depends = {})
+{
+    using RemHS =
+        hyperparam_detail::RemainderContigHyperparameterSet<resTy, argTy>;
+    static constexpr std::uint8_t vec_sz = RemHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = RemHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, RemainderInplaceContigFunctor,
+        remainder_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct RemainderInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!RemainderInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = remainder_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class remainder_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event remainder_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, RemainderInplaceStridedFunctor,
+        remainder_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides,
+                                          arg_p, arg_offset, res_p, res_offset,
+                                          depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct RemainderInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!RemainderInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = remainder_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::remainder
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
new file mode 100644
index 000000000000..dfd9ac72b860
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
@@ -0,0 +1,646 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of SUBTRACT(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+#include <vector>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::kernels::subtract
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct SubtractFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        return in1 - in2;
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = in1 - in2;
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using SubtractContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            SubtractFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using SubtractStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    SubtractFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct SubtractOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct SubtractContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class subtract_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event subtract_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg1_p,
+                                 ssize_t arg1_offset,
+                                 const char *arg2_p,
+                                 ssize_t arg2_offset,
+                                 char *res_p,
+                                 ssize_t res_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using SubHS =
+        hyperparam_detail::SubtractContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = SubHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = SubHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, SubtractOutputType, SubtractContigFunctor,
+        subtract_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = subtract_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractTypeMapFactory
+{
+    /*! @brief get typeid for output type of divide(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename SubtractOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class subtract_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    subtract_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg1_p,
+                          ssize_t arg1_offset,
+                          const char *arg2_p,
+                          ssize_t arg2_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, SubtractOutputType, SubtractStridedFunctor,
+        subtract_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = subtract_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+using SubtractContigMatrixContigRowBroadcastingFunctor =
+    elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        SubtractFunctor<argT1, argT2, resT>>;
+
+template <typename argT1, typename argT2, typename resT>
+using SubtractContigRowContigMatrixBroadcastingFunctor =
+    elementwise_common::BinaryContigRowContigMatrixBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        SubtractFunctor<argT1, argT2, resT>>;
+
+template <typename argT1, typename argT2, typename resT>
+class subtract_matrix_row_broadcast_sg_krn;
+
+template <typename argT1, typename argT2, typename resT>
+class subtract_row_matrix_broadcast_sg_krn;
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event subtract_contig_matrix_contig_row_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = mat[i,j] - vec[j]
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
+        argT1, argT2, resT, SubtractContigMatrixContigRowBroadcastingFunctor,
+        subtract_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p,
+                                              mat_offset, vec_p, vec_offset,
+                                              res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractContigMatrixContigRowBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using resT = typename SubtractOutputType<T1, T2>::value_type;
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value ||
+                          dpctl::tensor::type_utils::is_complex<resT>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn =
+                    subtract_contig_matrix_contig_row_broadcast_impl<T1, T2,
+                                                                     resT>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event subtract_contig_row_contig_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = op(vec[j], mat[i,j])
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl<
+        argT1, argT2, resT, SubtractContigRowContigMatrixBroadcastingFunctor,
+        subtract_row_matrix_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, vec_p,
+                                              vec_offset, mat_p, mat_offset,
+                                              res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractContigRowContigMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using resT = typename SubtractOutputType<T1, T2>::value_type;
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value ||
+                          dpctl::tensor::type_utils::is_complex<resT>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn =
+                    subtract_contig_row_contig_matrix_broadcast_impl<T1, T2,
+                                                                     resT>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct SubtractInplaceFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+
+    void operator()(resT &res, const argT &in)
+    {
+        res -= in;
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in)
+    {
+        res -= in;
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using SubtractInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        SubtractInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using SubtractInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        SubtractInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class subtract_inplace_contig_kernel;
+
+/* @brief Types supported by in-place subtraction */
+template <typename argTy, typename resTy>
+struct SubtractInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    resTy,
+                                    std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    resTy,
+                                    std::complex<double>>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct SubtractInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x -= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (SubtractInplaceTypePairSupport<argT, resT>::is_defined) {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event
+    subtract_inplace_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg_p,
+                                 ssize_t arg_offset,
+                                 char *res_p,
+                                 ssize_t res_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using SubHS =
+        hyperparam_detail::SubtractContigHyperparameterSet<resTy, argTy>;
+    static constexpr std::uint8_t vec_sz = SubHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = SubHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, SubtractInplaceContigFunctor,
+        subtract_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = subtract_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class subtract_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event subtract_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, SubtractInplaceStridedFunctor,
+        subtract_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides,
+                                         arg_p, arg_offset, res_p, res_offset,
+                                         depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = subtract_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+class subtract_inplace_row_matrix_broadcast_sg_krn;
+
+template <typename argT, typename resT>
+using SubtractInplaceRowMatrixBroadcastingFunctor =
+    elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor<
+        argT,
+        resT,
+        SubtractInplaceFunctor<argT, resT>>;
+
+template <typename argT, typename resT>
+sycl::event subtract_inplace_row_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_inplace_row_matrix_broadcast_impl<
+        argT, resT, SubtractInplaceRowMatrixBroadcastingFunctor,
+        subtract_inplace_row_matrix_broadcast_sg_krn>(
+        exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset,
+        depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractInplaceRowMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn = subtract_inplace_row_matrix_broadcast_impl<T1, T2>;
+                return fn;
+            }
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::subtract
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
index f8219764071f..1372663b96c5 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
@@ -29,7 +29,7 @@
 //===---------------------------------------------------------------------===//
 ///
 /// \file
-/// This file defines kernels for elementwise evaluation of DIVIDE(x1, x2)
+/// This file defines kernels for elementwise evaluation of TRUE_DIVIDE(x1, x2)
 /// function.
 //===---------------------------------------------------------------------===//
 
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.cpp
new file mode 100644
index 000000000000..8dca1635459a
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "copysign.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/copysign.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B25: ===== COPYSIGN (x1, x2)
+namespace impl
+{
+namespace copysign_fn_ns = dpctl::tensor::kernels::copysign;
+
+static binary_contig_impl_fn_ptr_t
+    copysign_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int copysign_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    copysign_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_copysign_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = copysign_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::CopysignTypeMapFactory;
+    DispatchTableBuilder<int, CopysignTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(copysign_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::CopysignStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, CopysignStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(copysign_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::CopysignContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, CopysignContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(copysign_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_copysign(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_copysign_dispatch_tables();
+        using impl::copysign_contig_dispatch_table;
+        using impl::copysign_output_id_table;
+        using impl::copysign_strided_dispatch_table;
+
+        auto copysign_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, copysign_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                copysign_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                copysign_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto copysign_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               copysign_output_id_table);
+        };
+        m.def("_copysign", copysign_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_copysign_result_type", copysign_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.hpp
new file mode 100644
index 000000000000..875443d792c2
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_copysign(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
index 8170f047c488..dc09318d66ad 100644
--- a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
@@ -54,7 +54,7 @@
 #include "cbrt.hpp"
 #include "ceil.hpp"
 #include "conj.hpp"
-// #include "copysign.hpp"
+#include "copysign.hpp"
 #include "cos.hpp"
 #include "cosh.hpp"
 #include "equal.hpp"
@@ -77,22 +77,22 @@
 #include "log1p.hpp"
 #include "log2.hpp"
 #include "logaddexp.hpp"
-// #include "logical_and.hpp"
+#include "logical_and.hpp"
 #include "logical_not.hpp"
-// #include "logical_or.hpp"
-// #include "logical_xor.hpp"
-// #include "maximum.hpp"
-// #include "minimum.hpp"
-// #include "multiply.hpp"
+#include "logical_or.hpp"
+#include "logical_xor.hpp"
+#include "maximum.hpp"
+#include "minimum.hpp"
+#include "multiply.hpp"
 #include "negative.hpp"
-// #include "nextafter.hpp"
-// #include "not_equal.hpp"
+#include "nextafter.hpp"
+#include "not_equal.hpp"
 #include "positive.hpp"
-// #include "pow.hpp"
+#include "pow.hpp"
 #include "proj.hpp"
 #include "real.hpp"
 #include "reciprocal.hpp"
-// #include "remainder.hpp"
+#include "remainder.hpp"
 #include "round.hpp"
 #include "rsqrt.hpp"
 #include "sign.hpp"
@@ -101,7 +101,7 @@
 #include "sinh.hpp"
 #include "sqrt.hpp"
 #include "square.hpp"
-// #include "subtract.hpp"
+#include "subtract.hpp"
 #include "tan.hpp"
 #include "tanh.hpp"
 #include "true_divide.hpp"
@@ -134,7 +134,7 @@ void init_elementwise_functions(py::module_ m)
     init_cbrt(m);
     init_ceil(m);
     init_conj(m);
-    // init_copysign(m);
+    init_copysign(m);
     init_cos(m);
     init_cosh(m);
     init_divide(m);
@@ -158,21 +158,22 @@ void init_elementwise_functions(py::module_ m)
     init_log1p(m);
     init_log2(m);
     init_logaddexp(m);
-    // init_logical_and(m);
+    init_logical_and(m);
     init_logical_not(m);
-    // init_logical_or(m);
-    // init_logical_xor(m);
-    // init_minimum(m);
-    // init_multiply(m);
-    // init_nextafter(m);
+    init_logical_or(m);
+    init_logical_xor(m);
+    init_maximum(m);
+    init_minimum(m);
+    init_multiply(m);
+    init_nextafter(m);
     init_negative(m);
-    // init_not_equal(m);
+    init_not_equal(m);
     init_positive(m);
-    // init_pow(m);
+    init_pow(m);
     init_proj(m);
     init_real(m);
     init_reciprocal(m);
-    // init_remainder(m);
+    init_remainder(m);
     init_round(m);
     init_rsqrt(m);
     init_sign(m);
@@ -181,7 +182,7 @@ void init_elementwise_functions(py::module_ m)
     init_sinh(m);
     init_sqrt(m);
     init_square(m);
-    // init_subtract(m);
+    init_subtract(m);
     init_tan(m);
     init_tanh(m);
     init_trunc(m);
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.cpp
new file mode 100644
index 000000000000..90c0b52a6aa2
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_and.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_and.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B16: ===== LOGICAL_AND (x1, x2)
+namespace impl
+{
+namespace logical_and_fn_ns = dpctl::tensor::kernels::logical_and;
+
+static binary_contig_impl_fn_ptr_t
+    logical_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_and_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_and_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_and_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalAndTypeMapFactory;
+    DispatchTableBuilder<int, LogicalAndTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_and_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalAndStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalAndStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_and_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalAndContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalAndContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_and_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_and(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_and_dispatch_tables();
+        using impl::logical_and_contig_dispatch_table;
+        using impl::logical_and_output_id_table;
+        using impl::logical_and_strided_dispatch_table;
+
+        auto logical_and_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_and_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_and_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logical_and_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_and_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_and_output_id_table);
+        };
+        m.def("_logical_and", logical_and_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_and_result_type", logical_and_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.hpp
new file mode 100644
index 000000000000..c22a98f24146
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_and(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.cpp
new file mode 100644
index 000000000000..38c981792345
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_or.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_or.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B17: ===== LOGICAL_OR (x1, x2)
+namespace impl
+{
+namespace logical_or_fn_ns = dpctl::tensor::kernels::logical_or;
+
+static binary_contig_impl_fn_ptr_t
+    logical_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_or_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_or_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_or_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalOrTypeMapFactory;
+    DispatchTableBuilder<int, LogicalOrTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_or_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalOrStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalOrStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_or_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalOrContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalOrContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_or_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_or(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_or_dispatch_tables();
+        using impl::logical_or_contig_dispatch_table;
+        using impl::logical_or_output_id_table;
+        using impl::logical_or_strided_dispatch_table;
+
+        auto logical_or_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                    const arrayT &dst, sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_or_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_or_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logical_or_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_or_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_or_output_id_table);
+        };
+        m.def("_logical_or", logical_or_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_or_result_type", logical_or_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.hpp
new file mode 100644
index 000000000000..11e83fe8cedf
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_or(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
new file mode 100644
index 000000000000..759133ca6120
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_xor.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_xor.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B18: ===== LOGICAL_XOR (x1, x2)
+namespace impl
+{
+namespace logical_xor_fn_ns = dpctl::tensor::kernels::logical_xor;
+
+static binary_contig_impl_fn_ptr_t
+    logical_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_xor_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_xor_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_xor_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalXorTypeMapFactory;
+    DispatchTableBuilder<int, LogicalXorTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_xor_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalXorStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalXorStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_xor_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalXorContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalXorContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_xor_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_xor(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_xor_dispatch_tables();
+        using impl::logical_xor_contig_dispatch_table;
+        using impl::logical_xor_output_id_table;
+        using impl::logical_xor_strided_dispatch_table;
+
+        auto logical_xor_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_xor_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_xor_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logical_xor_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_xor_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_xor_output_id_table);
+        };
+        m.def("_logical_xor", logical_xor_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_xor_result_type", logical_xor_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
new file mode 100644
index 000000000000..24c163249128
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_xor(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.cpp
new file mode 100644
index 000000000000..8fda65c43dca
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "maximum.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/maximum.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B26: ===== MAXIMUM (x1, x2)
+namespace impl
+{
+namespace maximum_fn_ns = dpctl::tensor::kernels::maximum;
+
+static binary_contig_impl_fn_ptr_t
+    maximum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int maximum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    maximum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_maximum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = maximum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MaximumTypeMapFactory;
+    DispatchTableBuilder<int, MaximumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(maximum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MaximumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MaximumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(maximum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MaximumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MaximumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(maximum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_maximum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_maximum_dispatch_tables();
+        using impl::maximum_contig_dispatch_table;
+        using impl::maximum_output_id_table;
+        using impl::maximum_strided_dispatch_table;
+
+        auto maximum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, maximum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                maximum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                maximum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto maximum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               maximum_output_id_table);
+        };
+        m.def("_maximum", maximum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_maximum_result_type", maximum_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.hpp
new file mode 100644
index 000000000000..1f8fc027ac1d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_maximum(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.cpp
new file mode 100644
index 000000000000..7055ce5c72f5
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "minimum.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/minimum.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B27: ===== MINIMUM (x1, x2)
+namespace impl
+{
+namespace minimum_fn_ns = dpctl::tensor::kernels::minimum;
+
+static binary_contig_impl_fn_ptr_t
+    minimum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int minimum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    minimum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_minimum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = minimum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MinimumTypeMapFactory;
+    DispatchTableBuilder<int, MinimumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(minimum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MinimumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MinimumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(minimum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MinimumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MinimumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(minimum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_minimum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_minimum_dispatch_tables();
+        using impl::minimum_contig_dispatch_table;
+        using impl::minimum_output_id_table;
+        using impl::minimum_strided_dispatch_table;
+
+        auto minimum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, minimum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                minimum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                minimum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto minimum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               minimum_output_id_table);
+        };
+        m.def("_minimum", minimum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_minimum_result_type", minimum_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.hpp
new file mode 100644
index 000000000000..be2e18a9b37c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_minimum(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.cpp
new file mode 100644
index 000000000000..5d25f8cc7b19
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.cpp
@@ -0,0 +1,244 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "multiply.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/multiply.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B19: ===== MULTIPLY (x1, x2)
+namespace impl
+{
+
+namespace multiply_fn_ns = dpctl::tensor::kernels::multiply;
+
+static binary_contig_impl_fn_ptr_t
+    multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int multiply_output_id_table[td_ns::num_types][td_ns::num_types];
+static int multiply_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// mul(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    multiply_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// mul(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    multiply_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    multiply_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    multiply_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    multiply_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_multiply_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = multiply_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MultiplyTypeMapFactory;
+    DispatchTableBuilder<int, MultiplyTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(multiply_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MultiplyStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MultiplyStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(multiply_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MultiplyContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MultiplyContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(multiply_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        MultiplyContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        multiply_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        MultiplyContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        multiply_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::MultiplyInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         MultiplyInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(multiply_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::MultiplyInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         MultiplyInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(multiply_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::MultiplyInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         MultiplyInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(multiply_inplace_row_matrix_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::MultiplyInplaceTypeMapFactory;
+    DispatchTableBuilder<int, MultiplyInplaceTypeMapFactory, num_types> dtb9;
+    dtb9.populate_dispatch_table(multiply_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_multiply(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_multiply_dispatch_tables();
+        using impl::multiply_contig_dispatch_table;
+        using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::multiply_output_id_table;
+        using impl::multiply_strided_dispatch_table;
+
+        auto multiply_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, multiply_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                multiply_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                multiply_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                multiply_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                multiply_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto multiply_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               multiply_output_id_table);
+        };
+        m.def("_multiply", multiply_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_multiply_result_type", multiply_result_type_pyapi, "");
+
+        using impl::multiply_inplace_contig_dispatch_table;
+        using impl::multiply_inplace_output_id_table;
+        using impl::multiply_inplace_row_matrix_dispatch_table;
+        using impl::multiply_inplace_strided_dispatch_table;
+
+        auto multiply_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                          sycl::queue &exec_q,
+                                          const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, multiply_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                multiply_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                multiply_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                multiply_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_multiply_inplace", multiply_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.hpp
new file mode 100644
index 000000000000..a4ed946a8501
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_multiply(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.cpp
new file mode 100644
index 000000000000..42e1ac9bd4c3
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "nextafter.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/nextafter.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B28: ===== NEXTAFTER (x1, x2)
+namespace impl
+{
+namespace nextafter_fn_ns = dpctl::tensor::kernels::nextafter;
+
+static binary_contig_impl_fn_ptr_t
+    nextafter_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int nextafter_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    nextafter_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_nextafter_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = nextafter_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::NextafterTypeMapFactory;
+    DispatchTableBuilder<int, NextafterTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(nextafter_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::NextafterStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, NextafterStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(nextafter_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::NextafterContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, NextafterContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(nextafter_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_nextafter(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_nextafter_dispatch_tables();
+        using impl::nextafter_contig_dispatch_table;
+        using impl::nextafter_output_id_table;
+        using impl::nextafter_strided_dispatch_table;
+
+        auto nextafter_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, nextafter_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                nextafter_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                nextafter_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto nextafter_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               nextafter_output_id_table);
+        };
+        m.def("_nextafter", nextafter_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_nextafter_result_type", nextafter_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.hpp
new file mode 100644
index 000000000000..76ad701d4012
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_nextafter(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.cpp
new file mode 100644
index 000000000000..dcbbf0cf015e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "not_equal.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/not_equal.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B20: ===== NOT_EQUAL (x1, x2)
+namespace impl
+{
+namespace not_equal_fn_ns = dpctl::tensor::kernels::not_equal;
+
+static binary_contig_impl_fn_ptr_t
+    not_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int not_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    not_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_not_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = not_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::NotEqualTypeMapFactory;
+    DispatchTableBuilder<int, NotEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(not_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::NotEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, NotEqualStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(not_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::NotEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, NotEqualContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(not_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_not_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_not_equal_dispatch_tables();
+        using impl::not_equal_contig_dispatch_table;
+        using impl::not_equal_output_id_table;
+        using impl::not_equal_strided_dispatch_table;
+
+        auto not_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, not_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                not_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                not_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto not_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               not_equal_output_id_table);
+        };
+        m.def("_not_equal", not_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_not_equal_result_type", not_equal_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.hpp
new file mode 100644
index 000000000000..c6c99bb793bc
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_not_equal(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.cpp
new file mode 100644
index 000000000000..990515fa5402
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.cpp
@@ -0,0 +1,203 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "pow.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/pow.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B21: ===== POW (x1, x2)
+namespace impl
+{
+
+namespace pow_fn_ns = dpctl::tensor::kernels::pow;
+
+static binary_contig_impl_fn_ptr_t pow_contig_dispatch_table[td_ns::num_types]
+                                                            [td_ns::num_types];
+
+static int pow_output_id_table[td_ns::num_types][td_ns::num_types];
+static int pow_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    pow_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    pow_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    pow_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_pow_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = pow_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::PowTypeMapFactory;
+    DispatchTableBuilder<int, PowTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(pow_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::PowStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, PowStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(pow_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::PowContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, PowContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(pow_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::PowInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         PowInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(pow_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::PowInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         PowInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(pow_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::PowInplaceTypeMapFactory;
+    DispatchTableBuilder<int, PowInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(pow_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_pow(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_pow_dispatch_tables();
+        using impl::pow_contig_dispatch_table;
+        using impl::pow_output_id_table;
+        using impl::pow_strided_dispatch_table;
+
+        auto pow_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, pow_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                pow_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                pow_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto pow_result_type_pyapi = [&](const py::dtype &dtype1,
+                                         const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               pow_output_id_table);
+        };
+        m.def("_pow", pow_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_pow_result_type", pow_result_type_pyapi, "");
+
+        using impl::pow_inplace_contig_dispatch_table;
+        using impl::pow_inplace_output_id_table;
+        using impl::pow_inplace_strided_dispatch_table;
+
+        auto pow_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, pow_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                pow_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                pow_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_pow_inplace", pow_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.hpp
new file mode 100644
index 000000000000..197a23b80d8a
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_pow(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.cpp
new file mode 100644
index 000000000000..8bdcdbe1b3dd
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.cpp
@@ -0,0 +1,205 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "remainder.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/remainder.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B22: ===== REMAINDER (x1, x2)
+namespace impl
+{
+
+namespace remainder_fn_ns = dpctl::tensor::kernels::remainder;
+
+static binary_contig_impl_fn_ptr_t
+    remainder_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int remainder_output_id_table[td_ns::num_types][td_ns::num_types];
+static int remainder_inplace_output_id_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    remainder_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    remainder_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    remainder_inplace_strided_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+void populate_remainder_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = remainder_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::RemainderTypeMapFactory;
+    DispatchTableBuilder<int, RemainderTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(remainder_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::RemainderStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, RemainderStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(remainder_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::RemainderContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, RemainderContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(remainder_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::RemainderInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         RemainderInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(remainder_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::RemainderInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         RemainderInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(remainder_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::RemainderInplaceTypeMapFactory;
+    DispatchTableBuilder<int, RemainderInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(remainder_inplace_output_id_table);
+}
+
+} // namespace impl
+
+void init_remainder(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_remainder_dispatch_tables();
+        using impl::remainder_contig_dispatch_table;
+        using impl::remainder_output_id_table;
+        using impl::remainder_strided_dispatch_table;
+
+        auto remainder_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, remainder_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                remainder_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                remainder_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto remainder_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               remainder_output_id_table);
+        };
+        m.def("_remainder", remainder_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_remainder_result_type", remainder_result_type_pyapi, "");
+
+        using impl::remainder_inplace_contig_dispatch_table;
+        using impl::remainder_inplace_output_id_table;
+        using impl::remainder_inplace_strided_dispatch_table;
+
+        auto remainder_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                           sycl::queue &exec_q,
+                                           const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, remainder_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                remainder_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                remainder_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_remainder_inplace", remainder_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.hpp
new file mode 100644
index 000000000000..c00bdc9e0e6c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_remainder(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.cpp
new file mode 100644
index 000000000000..ec6edaa52dd5
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.cpp
@@ -0,0 +1,243 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "subtract.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/subtract.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B23: ===== SUBTRACT (x1, x2)
+namespace impl
+{
+namespace subtract_fn_ns = dpctl::tensor::kernels::subtract;
+
+static binary_contig_impl_fn_ptr_t
+    subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int subtract_output_id_table[td_ns::num_types][td_ns::num_types];
+static int subtract_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// sub(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    subtract_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// sub(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    subtract_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    subtract_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    subtract_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    subtract_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_subtract_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = subtract_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::SubtractTypeMapFactory;
+    DispatchTableBuilder<int, SubtractTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(subtract_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::SubtractStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, SubtractStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(subtract_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::SubtractContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, SubtractContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(subtract_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::SubtractContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        SubtractContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        subtract_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::SubtractContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        SubtractContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        subtract_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::SubtractInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         SubtractInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(subtract_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::SubtractInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         SubtractInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(subtract_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::SubtractInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         SubtractInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(subtract_inplace_row_matrix_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::SubtractInplaceTypeMapFactory;
+    DispatchTableBuilder<int, SubtractInplaceTypeMapFactory, num_types> dtb9;
+    dtb9.populate_dispatch_table(subtract_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_subtract(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_subtract_dispatch_tables();
+        using impl::subtract_contig_dispatch_table;
+        using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::subtract_output_id_table;
+        using impl::subtract_strided_dispatch_table;
+
+        auto subtract_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, subtract_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                subtract_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                subtract_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                subtract_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                subtract_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto subtract_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               subtract_output_id_table);
+        };
+        m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_subtract_result_type", subtract_result_type_pyapi, "");
+
+        using impl::subtract_inplace_contig_dispatch_table;
+        using impl::subtract_inplace_output_id_table;
+        using impl::subtract_inplace_row_matrix_dispatch_table;
+        using impl::subtract_inplace_strided_dispatch_table;
+
+        auto subtract_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                          sycl::queue &exec_q,
+                                          const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, subtract_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                subtract_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                subtract_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                subtract_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_subtract_inplace", subtract_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.hpp
new file mode 100644
index 000000000000..89cdfd6d0ea0
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_subtract(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py
index b901206f8763..616d1e548a34 100644
--- a/dpnp/dpnp_iface_logic.py
+++ b/dpnp/dpnp_iface_logic.py
@@ -43,15 +43,13 @@
 # pylint: disable=duplicate-code
 # pylint: disable=no-name-in-module
 
-
-import dpctl.tensor._tensor_elementwise_impl as ti
 import dpctl.utils as dpu
 import numpy
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_elementwise_impl as ti_ext
+import dpctl_ext.tensor._tensor_elementwise_impl as ti
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
 
@@ -661,8 +659,8 @@ def array_equiv(a1, a2):
 
 equal = DPNPBinaryFunc(
     "equal",
-    ti_ext._equal_result_type,
-    ti_ext._equal,
+    ti._equal_result_type,
+    ti._equal,
     _EQUAL_DOCSTRING,
 )
 
@@ -737,8 +735,8 @@ def array_equiv(a1, a2):
 
 greater = DPNPBinaryFunc(
     "greater",
-    ti_ext._greater_result_type,
-    ti_ext._greater,
+    ti._greater_result_type,
+    ti._greater,
     _GREATER_DOCSTRING,
 )
 
@@ -814,8 +812,8 @@ def array_equiv(a1, a2):
 
 greater_equal = DPNPBinaryFunc(
     "greater_equal",
-    ti_ext._greater_equal_result_type,
-    ti_ext._greater_equal,
+    ti._greater_equal_result_type,
+    ti._greater_equal,
     _GREATER_EQUAL_DOCSTRING,
 )
 
@@ -1095,8 +1093,8 @@ def iscomplexobj(x):
 
 isfinite = DPNPUnaryFunc(
     "isfinite",
-    ti_ext._isfinite_result_type,
-    ti_ext._isfinite,
+    ti._isfinite_result_type,
+    ti._isfinite,
     _ISFINITE_DOCSTRING,
 )
 
@@ -1338,8 +1336,8 @@ def isin(
 
 isinf = DPNPUnaryFunc(
     "isinf",
-    ti_ext._isinf_result_type,
-    ti_ext._isinf,
+    ti._isinf_result_type,
+    ti._isinf,
     _ISINF_DOCSTRING,
 )
 
@@ -1396,8 +1394,8 @@ def isin(
 
 isnan = DPNPUnaryFunc(
     "isnan",
-    ti_ext._isnan_result_type,
-    ti_ext._isnan,
+    ti._isnan_result_type,
+    ti._isnan,
     _ISNAN_DOCSTRING,
 )
 
@@ -1750,8 +1748,8 @@ def isscalar(element):
 
 less = DPNPBinaryFunc(
     "less",
-    ti_ext._less_result_type,
-    ti_ext._less,
+    ti._less_result_type,
+    ti._less,
     _LESS_DOCSTRING,
 )
 
@@ -1826,8 +1824,8 @@ def isscalar(element):
 
 less_equal = DPNPBinaryFunc(
     "less_equal",
-    ti_ext._less_equal_result_type,
-    ti_ext._less_equal,
+    ti._less_equal_result_type,
+    ti._less_equal,
     _LESS_EQUAL_DOCSTRING,
 )
 
@@ -1969,8 +1967,8 @@ def isscalar(element):
 
 logical_not = DPNPUnaryFunc(
     "logical_not",
-    ti_ext._logical_not_result_type,
-    ti_ext._logical_not,
+    ti._logical_not_result_type,
+    ti._logical_not,
     _LOGICAL_NOT_DOCSTRING,
 )
 
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index 54a17cec0c37..89bc08681604 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -47,14 +47,13 @@
 import builtins
 import warnings
 
-import dpctl.tensor._tensor_elementwise_impl as ti
 import dpctl.utils as dpu
 import numpy
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_elementwise_impl as ti_ext
+import dpctl_ext.tensor._tensor_elementwise_impl as ti
 import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
@@ -385,8 +384,8 @@ def _validate_interp_param(param, name, exec_q, usm_type, dtype=None):
 
 abs = DPNPUnaryFunc(
     "abs",
-    ti_ext._abs_result_type,
-    ti_ext._abs,
+    ti._abs_result_type,
+    ti._abs,
     _ABS_DOCSTRING,
     mkl_fn_to_call="_mkl_abs_to_call",
     mkl_impl_fn="_abs",
@@ -469,8 +468,8 @@ def _validate_interp_param(param, name, exec_q, usm_type, dtype=None):
 
 add = DPNPBinaryFunc(
     "add",
-    ti_ext._add_result_type,
-    ti_ext._add,
+    ti._add_result_type,
+    ti._add,
     _ADD_DOCSTRING,
     mkl_fn_to_call="_mkl_add_to_call",
     mkl_impl_fn="_add",
@@ -541,8 +540,8 @@ def _validate_interp_param(param, name, exec_q, usm_type, dtype=None):
 
 angle = DPNPAngle(
     "angle",
-    ti_ext._angle_result_type,
-    ti_ext._angle,
+    ti._angle_result_type,
+    ti._angle,
     _ANGLE_DOCSTRING,
     mkl_fn_to_call="_mkl_arg_to_call",
     mkl_impl_fn="_arg",
@@ -647,8 +646,8 @@ def around(x, /, decimals=0, out=None):
 
 ceil = DPNPUnaryFunc(
     "ceil",
-    ti_ext._ceil_result_type,
-    ti_ext._ceil,
+    ti._ceil_result_type,
+    ti._ceil,
     _CEIL_DOCSTRING,
     mkl_fn_to_call="_mkl_ceil_to_call",
     mkl_impl_fn="_ceil",
@@ -782,8 +781,8 @@ def clip(a, /, min=None, max=None, *, out=None, order="K", **kwargs):
 
 conj = DPNPUnaryFunc(
     "conj",
-    ti_ext._conj_result_type,
-    ti_ext._conj,
+    ti._conj_result_type,
+    ti._conj,
     _CONJ_DOCSTRING,
     mkl_fn_to_call="_mkl_conj_to_call",
     mkl_impl_fn="_conj",
@@ -1558,12 +1557,12 @@ def diff(a, n=1, axis=-1, prepend=None, append=None):
 
 divide = DPNPBinaryFunc(
     "divide",
-    ti_ext._divide_result_type,
-    ti_ext._divide,
+    ti._divide_result_type,
+    ti._divide,
     _DIVIDE_DOCSTRING,
     mkl_fn_to_call="_mkl_div_to_call",
     mkl_impl_fn="_div",
-    binary_inplace_fn=ti_ext._divide_inplace,
+    binary_inplace_fn=ti._divide_inplace,
     acceptance_fn=dtu._acceptance_fn_divide,
 )
 
@@ -2057,8 +2056,8 @@ def ediff1d(ary, to_end=None, to_begin=None):
 
 floor = DPNPUnaryFunc(
     "floor",
-    ti_ext._floor_result_type,
-    ti_ext._floor,
+    ti._floor_result_type,
+    ti._floor,
     _FLOOR_DOCSTRING,
     mkl_fn_to_call="_mkl_floor_to_call",
     mkl_impl_fn="_floor",
@@ -2139,10 +2138,10 @@ def ediff1d(ary, to_end=None, to_begin=None):
 
 floor_divide = DPNPBinaryFunc(
     "floor_divide",
-    ti_ext._floor_divide_result_type,
-    ti_ext._floor_divide,
+    ti._floor_divide_result_type,
+    ti._floor_divide,
     _FLOOR_DIVIDE_DOCSTRING,
-    binary_inplace_fn=ti_ext._floor_divide_inplace,
+    binary_inplace_fn=ti._floor_divide_inplace,
 )
 
 
@@ -2941,8 +2940,8 @@ def gradient(f, *varargs, axis=None, edge_order=1):
 
 imag = DPNPImag(
     "imag",
-    ti_ext._imag_result_type,
-    ti_ext._imag,
+    ti._imag_result_type,
+    ti._imag,
     _IMAG_DOCSTRING,
 )
 
@@ -3852,8 +3851,8 @@ def _check_nan_inf(val, val_dt):
 
 negative = DPNPUnaryFunc(
     "negative",
-    ti_ext._negative_result_type,
-    ti_ext._negative,
+    ti._negative_result_type,
+    ti._negative,
     _NEGATIVE_DOCSTRING,
     acceptance_fn=acceptance_fn_negative,
 )
@@ -3988,8 +3987,8 @@ def _check_nan_inf(val, val_dt):
 
 positive = DPNPUnaryFunc(
     "positive",
-    ti_ext._positive_result_type,
-    ti_ext._positive,
+    ti._positive_result_type,
+    ti._positive,
     _POSITIVE_DOCSTRING,
     acceptance_fn=acceptance_fn_positive,
 )
@@ -4250,8 +4249,8 @@ def prod(
 
 proj = DPNPUnaryFunc(
     "proj",
-    ti_ext._proj_result_type,
-    ti_ext._proj,
+    ti._proj_result_type,
+    ti._proj,
     _PROJ_DOCSTRING,
 )
 
@@ -4313,8 +4312,8 @@ def prod(
 
 real = DPNPReal(
     "real",
-    ti_ext._real_result_type,
-    ti_ext._real,
+    ti._real_result_type,
+    ti._real,
     _REAL_DOCSTRING,
 )
 
@@ -4596,8 +4595,8 @@ def real_if_close(a, tol=100):
 
 round = DPNPRound(
     "round",
-    ti_ext._round_result_type,
-    ti_ext._round,
+    ti._round_result_type,
+    ti._round,
     _ROUND_DOCSTRING,
     mkl_fn_to_call="_mkl_round_to_call",
     mkl_impl_fn="_round",
@@ -4668,8 +4667,8 @@ def real_if_close(a, tol=100):
 
 sign = DPNPUnaryFunc(
     "sign",
-    ti_ext._sign_result_type,
-    ti_ext._sign,
+    ti._sign_result_type,
+    ti._sign,
     _SIGN_DOCSTRING,
     acceptance_fn=acceptance_fn_sign,
 )
@@ -4730,8 +4729,8 @@ def real_if_close(a, tol=100):
 
 signbit = DPNPUnaryFunc(
     "signbit",
-    ti_ext._signbit_result_type,
-    ti_ext._signbit,
+    ti._signbit_result_type,
+    ti._signbit,
     _SIGNBIT_DOCSTRING,
 )
 
@@ -5229,8 +5228,8 @@ def trapezoid(y, x=None, dx=1.0, axis=-1):
 
 trunc = DPNPUnaryFunc(
     "trunc",
-    ti_ext._trunc_result_type,
-    ti_ext._trunc,
+    ti._trunc_result_type,
+    ti._trunc,
     _TRUNC_DOCSTRING,
     mkl_fn_to_call="_mkl_trunc_to_call",
     mkl_impl_fn="_trunc",

From 8d1c75b19c8af4e54898bdd8613ebfe1690eb8b3 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 19 Mar 2026 15:53:15 +0100
Subject: [PATCH 20/43] Extend `dpctl_ext.tensor` with the remaining functions 
 (#2806)

This PR extends `dpctl_ext.tensor` API with the remaining statistical
and testing functions adding `std(), var(), mean(), allclose()`
---
 dpctl_ext/tensor/__init__.py               |   6 +
 dpctl_ext/tensor/_clip.py                  |   2 +-
 dpctl_ext/tensor/_ctors.py                 |   2 +-
 dpctl_ext/tensor/_reduction.py             |   2 +-
 dpctl_ext/tensor/_set_functions.py         |   2 +-
 dpctl_ext/tensor/_statistical_functions.py | 384 +++++++++++++++++++++
 dpctl_ext/tensor/_testing.py               | 175 ++++++++++
 dpctl_ext/tensor/_utility_functions.py     |   2 +-
 dpnp/dpnp_algo/dpnp_elementwise_common.py  |  20 +-
 dpnp/dpnp_iface_statistics.py              |  11 +-
 dpnp/tests/test_sycl_queue.py              |   2 +-
 11 files changed, 586 insertions(+), 22 deletions(-)
 create mode 100644 dpctl_ext/tensor/_statistical_functions.py
 create mode 100644 dpctl_ext/tensor/_testing.py

diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index 71ef714c642a..7a6923169c1f 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -179,6 +179,8 @@
     unique_values,
 )
 from ._sorting import argsort, sort, top_k
+from ._statistical_functions import mean, std, var
+from ._testing import allclose
 from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type
 from ._utility_functions import all, any, diff
 
@@ -188,6 +190,7 @@
     "acosh",
     "add",
     "all",
+    "allclose",
     "angle",
     "any",
     "arange",
@@ -267,6 +270,7 @@
     "log10",
     "max",
     "maximum",
+    "mean",
     "meshgrid",
     "min",
     "minimum",
@@ -308,6 +312,7 @@
     "square",
     "squeeze",
     "stack",
+    "std",
     "subtract",
     "sum",
     "swapaxes",
@@ -327,6 +332,7 @@
     "unique_inverse",
     "unique_values",
     "unstack",
+    "var",
     "vecdot",
     "where",
     "zeros",
diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py
index ef07269c4ea0..c21d601966bd 100644
--- a/dpctl_ext/tensor/_clip.py
+++ b/dpctl_ext/tensor/_clip.py
@@ -28,12 +28,12 @@
 
 import dpctl
 import dpctl.tensor as dpt
-import dpctl.tensor._tensor_elementwise_impl as tei
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._tensor_elementwise_impl as tei
 import dpctl_ext.tensor._tensor_impl as ti
 
 from ._copy_utils import (
diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py
index 0b7650873fe3..21c3d0077189 100644
--- a/dpctl_ext/tensor/_ctors.py
+++ b/dpctl_ext/tensor/_ctors.py
@@ -361,7 +361,7 @@ def _copy_through_host_walker(seq_o, usm_res):
             )
             is None
         ):
-            usm_res[...] = dpt.asnumpy(seq_o).copy()
+            usm_res[...] = dpt_ext.asnumpy(seq_o).copy()
             return
         else:
             usm_res[...] = seq_o
diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py
index b8fdcf4f37e6..2daf07b81d85 100644
--- a/dpctl_ext/tensor/_reduction.py
+++ b/dpctl_ext/tensor/_reduction.py
@@ -506,7 +506,7 @@ def count_nonzero(x, /, *, axis=None, keepdims=False, out=None):
             type.
     """
     if x.dtype != dpt.bool:
-        x = dpt.astype(x, dpt.bool, copy=False)
+        x = dpt_ext.astype(x, dpt.bool, copy=False)
     return sum(
         x,
         axis=axis,
diff --git a/dpctl_ext/tensor/_set_functions.py b/dpctl_ext/tensor/_set_functions.py
index 93f81f044fd2..2672e082d18e 100644
--- a/dpctl_ext/tensor/_set_functions.py
+++ b/dpctl_ext/tensor/_set_functions.py
@@ -30,11 +30,11 @@
 
 import dpctl.tensor as dpt
 import dpctl.utils as du
-from dpctl.tensor._tensor_elementwise_impl import _not_equal, _subtract
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt_ext
+from dpctl_ext.tensor._tensor_elementwise_impl import _not_equal, _subtract
 
 from ._copy_utils import _empty_like_orderK
 from ._scalar_utils import (
diff --git a/dpctl_ext/tensor/_statistical_functions.py b/dpctl_ext/tensor/_statistical_functions.py
new file mode 100644
index 000000000000..5513dfa7a65f
--- /dev/null
+++ b/dpctl_ext/tensor/_statistical_functions.py
@@ -0,0 +1,384 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+
+import dpctl.tensor as dpt
+import dpctl.utils as du
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor._tensor_elementwise_impl as tei
+import dpctl_ext.tensor._tensor_impl as ti
+import dpctl_ext.tensor._tensor_reductions_impl as tri
+
+from ._numpy_helper import normalize_axis_tuple
+
+
+def _var_impl(x, axis, correction, keepdims):
+    nd = x.ndim
+    if axis is None:
+        axis = tuple(range(nd))
+    if not isinstance(axis, (tuple, list)):
+        axis = (axis,)
+    axis = normalize_axis_tuple(axis, nd, "axis")
+    perm = []
+    nelems = 1
+    for i in range(nd):
+        if i not in axis:
+            perm.append(i)
+        else:
+            nelems *= x.shape[i]
+    red_nd = len(axis)
+    perm = perm + list(axis)
+    q = x.sycl_queue
+    inp_dt = x.dtype
+    res_dt = (
+        inp_dt
+        if inp_dt.kind == "f"
+        else dpt.dtype(ti.default_device_fp_type(q))
+    )
+    res_usm_type = x.usm_type
+
+    _manager = du.SequentialOrderManager[q]
+    dep_evs = _manager.submitted_events
+    if inp_dt != res_dt:
+        buf = dpt_ext.empty_like(x, dtype=res_dt)
+        ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x, dst=buf, sycl_queue=q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_e_buf, c_e1)
+    else:
+        buf = x
+    # calculate mean
+    buf2 = dpt_ext.permute_dims(buf, perm)
+    res_shape = buf2.shape[: nd - red_nd]
+    # use keepdims=True path for later broadcasting
+    if red_nd == 0:
+        mean_ary = dpt_ext.empty_like(buf)
+        dep_evs = _manager.submitted_events
+        ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=buf, dst=mean_ary, sycl_queue=q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_e1, c_e2)
+    else:
+        mean_ary = dpt_ext.empty(
+            res_shape,
+            dtype=res_dt,
+            usm_type=res_usm_type,
+            sycl_queue=q,
+        )
+        dep_evs = _manager.submitted_events
+        ht_e1, r_e1 = tri._sum_over_axis(
+            src=buf2,
+            trailing_dims_to_reduce=red_nd,
+            dst=mean_ary,
+            sycl_queue=q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_e1, r_e1)
+
+        mean_ary_shape = res_shape + (1,) * red_nd
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        mean_ary = dpt_ext.permute_dims(
+            dpt_ext.reshape(mean_ary, mean_ary_shape), inv_perm
+        )
+    # divide in-place to get mean
+    mean_ary_shape = mean_ary.shape
+
+    dep_evs = _manager.submitted_events
+    ht_e2, d_e1 = tei._divide_by_scalar(
+        src=mean_ary, scalar=nelems, dst=mean_ary, sycl_queue=q, depends=dep_evs
+    )
+    _manager.add_event_pair(ht_e2, d_e1)
+
+    # subtract mean from original array to get deviations
+    dev_ary = dpt_ext.empty_like(buf)
+    if mean_ary_shape != buf.shape:
+        mean_ary = dpt_ext.broadcast_to(mean_ary, buf.shape)
+    ht_e4, su_e = tei._subtract(
+        src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1]
+    )
+    _manager.add_event_pair(ht_e4, su_e)
+    # square deviations
+    ht_e5, sq_e = tei._square(
+        src=dev_ary, dst=dev_ary, sycl_queue=q, depends=[su_e]
+    )
+    _manager.add_event_pair(ht_e5, sq_e)
+
+    # take sum of squared deviations
+    dev_ary2 = dpt_ext.permute_dims(dev_ary, perm)
+    if red_nd == 0:
+        res = dev_ary
+    else:
+        res = dpt_ext.empty(
+            res_shape,
+            dtype=res_dt,
+            usm_type=res_usm_type,
+            sycl_queue=q,
+        )
+        ht_e6, r_e2 = tri._sum_over_axis(
+            src=dev_ary2,
+            trailing_dims_to_reduce=red_nd,
+            dst=res,
+            sycl_queue=q,
+            depends=[sq_e],
+        )
+        _manager.add_event_pair(ht_e6, r_e2)
+
+        if keepdims:
+            res_shape = res_shape + (1,) * red_nd
+            inv_perm = sorted(range(nd), key=lambda d: perm[d])
+            res = dpt_ext.permute_dims(
+                dpt_ext.reshape(res, res_shape), inv_perm
+            )
+    res_shape = res.shape
+    # when nelems - correction <= 0, yield nans
+    div = max(nelems - correction, 0)
+    if not div:
+        div = dpt.nan
+    dep_evs = _manager.submitted_events
+    ht_e7, d_e2 = tei._divide_by_scalar(
+        src=res, scalar=div, dst=res, sycl_queue=q, depends=dep_evs
+    )
+    _manager.add_event_pair(ht_e7, d_e2)
+    return res, [d_e2]
+
+
+def mean(x, axis=None, keepdims=False):
+    """mean(x, axis=None, keepdims=False)
+
+    Calculates the arithmetic mean of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which the arithmetic means must be computed. If
+            a tuple of unique integers, the means are computed over multiple
+            axes. If `None`, the mean is computed over the entire array.
+            Default: `None`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input array according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the arithmetic means. If the mean was computed
+            over the entire array, a zero-dimensional array is returned.
+
+            If `x` has a floating-point data type, the returned array will have
+            the same data type as `x`.
+            If `x` has a boolean or integral data type, the returned array
+            will have the default floating point data type for the device
+            where input array `x` is allocated.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+    nd = x.ndim
+    if axis is None:
+        axis = tuple(range(nd))
+    if not isinstance(axis, (tuple, list)):
+        axis = (axis,)
+    axis = normalize_axis_tuple(axis, nd, "axis")
+    perm = []
+    nelems = 1
+    for i in range(nd):
+        if i not in axis:
+            perm.append(i)
+        else:
+            nelems *= x.shape[i]
+    sum_nd = len(axis)
+    perm = perm + list(axis)
+    arr2 = dpt_ext.permute_dims(x, perm)
+    res_shape = arr2.shape[: nd - sum_nd]
+    q = x.sycl_queue
+    inp_dt = x.dtype
+    res_dt = (
+        x.dtype
+        if x.dtype.kind in "fc"
+        else dpt.dtype(ti.default_device_fp_type(q))
+    )
+    res_usm_type = x.usm_type
+    if sum_nd == 0:
+        return dpt_ext.astype(x, res_dt, copy=True)
+
+    _manager = du.SequentialOrderManager[q]
+    dep_evs = _manager.submitted_events
+    if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q):
+        res = dpt_ext.empty(
+            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
+        )
+        ht_e1, r_e = tri._sum_over_axis(
+            src=arr2,
+            trailing_dims_to_reduce=sum_nd,
+            dst=res,
+            sycl_queue=q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_e1, r_e)
+    else:
+        tmp = dpt_ext.empty(
+            arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
+        )
+        ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arr2, dst=tmp, sycl_queue=q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_e_cpy, cpy_e)
+        res = dpt_ext.empty(
+            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
+        )
+        ht_e_red, r_e = tri._sum_over_axis(
+            src=tmp,
+            trailing_dims_to_reduce=sum_nd,
+            dst=res,
+            sycl_queue=q,
+            depends=[cpy_e],
+        )
+        _manager.add_event_pair(ht_e_red, r_e)
+
+    if keepdims:
+        res_shape = res_shape + (1,) * sum_nd
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        res = dpt_ext.permute_dims(dpt_ext.reshape(res, res_shape), inv_perm)
+
+    dep_evs = _manager.submitted_events
+    ht_e2, div_e = tei._divide_by_scalar(
+        src=res, scalar=nelems, dst=res, sycl_queue=q, depends=dep_evs
+    )
+    _manager.add_event_pair(ht_e2, div_e)
+    return res
+
+
+def var(x, axis=None, correction=0.0, keepdims=False):
+    """var(x, axis=None, correction=0.0, keepdims=False)
+
+    Calculates the variance of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which the variances must be computed. If a tuple
+            of unique integers, the variances are computed over multiple axes.
+            If `None`, the variance is computed over the entire array.
+            Default: `None`.
+        correction (Optional[float, int]):
+            degrees of freedom adjustment. The divisor used in calculating the
+            variance is `N - correction`, where `N` corresponds to the total
+            number of elements over which the variance is calculated.
+            Default: `0.0`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input array according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the variances. If the variance was computed
+            over the entire array, a zero-dimensional array is returned.
+
+            If `x` has a real-valued floating-point data type, the returned
+            array will have the same data type as `x`.
+            If `x` has a boolean or integral data type, the returned array
+            will have the default floating point data type for the device
+            where input array `x` is allocated.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    if not isinstance(correction, (int, float)):
+        raise TypeError(
+            "Expected a Python integer or float for `correction`, got"
+            f"{type(x)}"
+        )
+
+    if x.dtype.kind == "c":
+        raise ValueError("`var` does not support complex types")
+
+    res, _ = _var_impl(x, axis, correction, keepdims)
+    return res
+
+
+def std(x, axis=None, correction=0.0, keepdims=False):
+    """std(x, axis=None, correction=0.0, keepdims=False)
+
+    Calculates the standard deviation of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which the standard deviations must be computed.
+            If a tuple of unique integers, the standard deviations are computed
+            over multiple axes. If `None`, the standard deviation is computed
+            over the entire array. Default: `None`.
+        correction (Optional[float, int]):
+            degrees of freedom adjustment. The divisor used in calculating the
+            standard deviation is `N - correction`, where `N` corresponds to the
+            total number of elements over which the standard deviation is
+            calculated. Default: `0.0`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input array according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the standard deviations. If the standard
+            deviation was computed over the entire array, a zero-dimensional
+            array is returned.
+
+            If `x` has a real-valued floating-point data type, the returned
+            array will have the same data type as `x`.
+            If `x` has a boolean or integral data type, the returned array
+            will have the default floating point data type for the device
+            where input array `x` is allocated.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    if not isinstance(correction, (int, float)):
+        raise TypeError(
+            "Expected a Python integer or float for `correction`,"
+            f"got {type(x)}"
+        )
+
+    if x.dtype.kind == "c":
+        raise ValueError("`std` does not support complex types")
+
+    exec_q = x.sycl_queue
+    _manager = du.SequentialOrderManager[exec_q]
+    res, deps = _var_impl(x, axis, correction, keepdims)
+    ht_ev, sqrt_ev = tei._sqrt(
+        src=res, dst=res, sycl_queue=exec_q, depends=deps
+    )
+    _manager.add_event_pair(ht_ev, sqrt_ev)
+    return res
diff --git a/dpctl_ext/tensor/_testing.py b/dpctl_ext/tensor/_testing.py
new file mode 100644
index 000000000000..c0f475212232
--- /dev/null
+++ b/dpctl_ext/tensor/_testing.py
@@ -0,0 +1,175 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl.tensor as dpt
+import dpctl.utils as du
+import numpy as np
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt_ext
+
+from ._manipulation_functions import _broadcast_shape_impl
+from ._type_utils import _to_device_supported_dtype
+
+
+def _allclose_complex_fp(z1, z2, atol, rtol, equal_nan):
+    z1r = dpt_ext.real(z1)
+    z1i = dpt_ext.imag(z1)
+    z2r = dpt_ext.real(z2)
+    z2i = dpt_ext.imag(z2)
+    if equal_nan:
+        check1 = dpt_ext.all(
+            dpt_ext.isnan(z1r) == dpt_ext.isnan(z2r)
+        ) and dpt_ext.all(dpt_ext.isnan(z1i) == dpt_ext.isnan(z2i))
+    else:
+        check1 = (
+            dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(z1r)))
+            and dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(z1i)))
+        ) and (
+            dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(z2r)))
+            and dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(z2i)))
+        )
+    if not check1:
+        return check1
+    mr = dpt_ext.isinf(z1r)
+    mi = dpt_ext.isinf(z1i)
+    check2 = dpt_ext.all(mr == dpt_ext.isinf(z2r)) and dpt_ext.all(
+        mi == dpt_ext.isinf(z2i)
+    )
+    if not check2:
+        return check2
+    check3 = dpt_ext.all(z1r[mr] == z2r[mr]) and dpt_ext.all(z1i[mi] == z2i[mi])
+    if not check3:
+        return check3
+    mr = dpt_ext.isfinite(z1r)
+    mi = dpt_ext.isfinite(z1i)
+    mv1 = z1r[mr]
+    mv2 = z2r[mr]
+    check4 = dpt_ext.all(
+        dpt_ext.abs(mv1 - mv2)
+        < dpt_ext.maximum(
+            atol, rtol * dpt_ext.maximum(dpt_ext.abs(mv1), dpt_ext.abs(mv2))
+        )
+    )
+    if not check4:
+        return check4
+    mv1 = z1i[mi]
+    mv2 = z2i[mi]
+    check5 = dpt_ext.all(
+        dpt_ext.abs(mv1 - mv2)
+        <= dpt_ext.maximum(
+            atol, rtol * dpt_ext.maximum(dpt_ext.abs(mv1), dpt_ext.abs(mv2))
+        )
+    )
+    return check5
+
+
+def _allclose_real_fp(r1, r2, atol, rtol, equal_nan):
+    if equal_nan:
+        check1 = dpt_ext.all(dpt_ext.isnan(r1) == dpt_ext.isnan(r2))
+    else:
+        check1 = dpt_ext.logical_not(
+            dpt_ext.any(dpt_ext.isnan(r1))
+        ) and dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(r2)))
+    if not check1:
+        return check1
+    mr = dpt_ext.isinf(r1)
+    check2 = dpt_ext.all(mr == dpt_ext.isinf(r2))
+    if not check2:
+        return check2
+    check3 = dpt_ext.all(r1[mr] == r2[mr])
+    if not check3:
+        return check3
+    m = dpt_ext.isfinite(r1)
+    mv1 = r1[m]
+    mv2 = r2[m]
+    check4 = dpt_ext.all(
+        dpt_ext.abs(mv1 - mv2)
+        <= dpt_ext.maximum(
+            atol, rtol * dpt_ext.maximum(dpt_ext.abs(mv1), dpt_ext.abs(mv2))
+        )
+    )
+    return check4
+
+
+def _allclose_others(r1, r2):
+    return dpt_ext.all(r1 == r2)
+
+
+def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
+    """allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False)
+
+    Returns True if two arrays are element-wise equal within tolerances.
+
+    The testing is based on the following elementwise comparison:
+
+           abs(a - b) <= max(atol, rtol * max(abs(a), abs(b)))
+    """
+    if not isinstance(a1, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpctl.tensor.usm_ndarray type, got {type(a1)}."
+        )
+    if not isinstance(a2, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpctl.tensor.usm_ndarray type, got {type(a2)}."
+        )
+    atol = float(atol)
+    rtol = float(rtol)
+    if atol < 0.0 or rtol < 0.0:
+        raise ValueError(
+            "Absolute and relative tolerances must be non-negative"
+        )
+    equal_nan = bool(equal_nan)
+    exec_q = du.get_execution_queue(tuple(a.sycl_queue for a in (a1, a2)))
+    if exec_q is None:
+        raise du.ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments."
+        )
+    res_sh = _broadcast_shape_impl([a1.shape, a2.shape])
+    b1 = a1
+    b2 = a2
+    if b1.dtype == b2.dtype:
+        res_dt = b1.dtype
+    else:
+        res_dt = np.promote_types(b1.dtype, b2.dtype)
+        res_dt = _to_device_supported_dtype(res_dt, exec_q.sycl_device)
+        b1 = dpt_ext.astype(b1, res_dt)
+        b2 = dpt_ext.astype(b2, res_dt)
+
+    b1 = dpt_ext.broadcast_to(b1, res_sh)
+    b2 = dpt_ext.broadcast_to(b2, res_sh)
+
+    k = b1.dtype.kind
+    if k == "c":
+        return _allclose_complex_fp(b1, b2, atol, rtol, equal_nan)
+    elif k == "f":
+        return _allclose_real_fp(b1, b2, atol, rtol, equal_nan)
+    else:
+        return _allclose_others(b1, b2)
diff --git a/dpctl_ext/tensor/_utility_functions.py b/dpctl_ext/tensor/_utility_functions.py
index a122ac3d6cea..821f0954017a 100644
--- a/dpctl_ext/tensor/_utility_functions.py
+++ b/dpctl_ext/tensor/_utility_functions.py
@@ -489,7 +489,7 @@ def diff(x, /, *, axis=-1, n=1, prepend=None, append=None):
         slice(None) if i != axis else slice(None, -1) for i in range(x_nd)
     )
 
-    diff_op = dpt.not_equal if x.dtype == dpt.bool else dpt.subtract
+    diff_op = dpt_ext.not_equal if x.dtype == dpt.bool else dpt_ext.subtract
     if n > 1:
         arr_tmp0 = diff_op(arr[sl0], arr[sl1])
         arr_tmp1 = diff_op(arr_tmp0[sl0], arr_tmp0[sl1])
diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
index d7eeccf78489..6aaf46f7ad9c 100644
--- a/dpnp/dpnp_algo/dpnp_elementwise_common.py
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -30,18 +30,8 @@
 from functools import wraps
 
 import dpctl.tensor as dpt
-import dpctl.tensor._type_utils as dtu
 import dpctl.utils as dpu
 import numpy
-from dpctl.tensor._elementwise_common import (
-    BinaryElementwiseFunc,
-    UnaryElementwiseFunc,
-)
-from dpctl.tensor._scalar_utils import (
-    _get_dtype,
-    _get_shape,
-    _validate_dtype,
-)
 
 # pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
@@ -49,8 +39,18 @@
 import dpctl_ext.tensor as dpt_ext
 import dpctl_ext.tensor._copy_utils as dtc
 import dpctl_ext.tensor._tensor_impl as dti
+import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.vm._vm_impl as vmi
+from dpctl_ext.tensor._elementwise_common import (
+    BinaryElementwiseFunc,
+    UnaryElementwiseFunc,
+)
+from dpctl_ext.tensor._scalar_utils import (
+    _get_dtype,
+    _get_shape,
+    _validate_dtype,
+)
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations
 from dpnp.dpnp_utils.dpnp_utils_common import (
diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py
index 75fe215837b9..1d89d14c8df8 100644
--- a/dpnp/dpnp_iface_statistics.py
+++ b/dpnp/dpnp_iface_statistics.py
@@ -41,14 +41,13 @@
 
 import math
 
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_elementwise_impl as ti
 import dpctl.utils as dpu
 import numpy
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
+import dpctl_ext.tensor._tensor_elementwise_impl as ti
 import dpnp
 
 # pylint: disable=no-name-in-module
@@ -1118,7 +1117,7 @@ def max(a, axis=None, out=None, keepdims=False, initial=None, where=True):
     return dpnp_wrap_reduction_call(
         usm_a,
         out,
-        dpt_ext.max,
+        dpt.max,
         a.dtype,
         axis=axis,
         keepdims=keepdims,
@@ -1207,7 +1206,7 @@ def mean(a, /, axis=None, dtype=None, out=None, keepdims=False, *, where=True):
     usm_a = dpnp.get_usm_ndarray(a)
     usm_res = dpt.mean(usm_a, axis=axis, keepdims=keepdims)
     if dtype is not None:
-        usm_res = dpt_ext.astype(usm_res, dtype)
+        usm_res = dpt.astype(usm_res, dtype)
 
     return dpnp.get_result_array(usm_res, out, casting="unsafe")
 
@@ -1395,7 +1394,7 @@ def min(a, axis=None, out=None, keepdims=False, initial=None, where=True):
     return dpnp_wrap_reduction_call(
         usm_a,
         out,
-        dpt_ext.min,
+        dpt.min,
         a.dtype,
         axis=axis,
         keepdims=keepdims,
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index a9c076a7c476..4485d79b2213 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -1103,7 +1103,7 @@ def test_from_dlpack(arr_dtype, shape, device):
 @pytest.mark.parametrize("device", valid_dev, ids=dev_ids)
 @pytest.mark.parametrize("arr_dtype", get_all_dtypes(no_float16=True))
 def test_from_dlpack_with_dpt(arr_dtype, device):
-    X = dpctl.tensor.empty((64,), dtype=arr_dtype, device=device)
+    X = dpt.empty((64,), dtype=arr_dtype, device=device)
     Y = dpnp.from_dlpack(X)
     assert_array_equal(X, Y)
     assert isinstance(Y, dpnp.dpnp_array.dpnp_array)

From 851628bdbd0a25755359eb423adec813f0a75152 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 31 Mar 2026 10:06:15 +0200
Subject: [PATCH 21/43] Move `usm_ndarray`  into `dpctl_ext.tensor` (#2807)

This PR proposes to migrate the tensor interface (`usm_ndarray, dlpack,
flags`) into `dpctl_ext/tensor` making `dpnp` independent of `dpctl's`
tensor module.

Updates:
> -  Introduce `dpctl_ext_capi.h`
> - Implement a clean CMake interface library `DpctlExtCAPI` to properly
propagate generated headers to consumers
> - Update remaining imports from `dpctl.tensor` to `dpctl_ext.tensor`
> - Link all backend extensions against `DpctlExtCAPI` to ensure
consistent access to the C-API
---
 .github/workflows/build-sphinx.yml            |    2 +-
 .github/workflows/generate_coverage.yaml      |    4 +-
 .gitignore                                    |    1 +
 CMakeLists.txt                                |   13 +-
 dpctl_ext/CMakeLists.txt                      |   93 +-
 dpctl_ext/apis/include/dpctl_ext_capi.h       |  106 +
 dpctl_ext/tensor/CMakeLists.txt               |   10 +
 dpctl_ext/tensor/__init__.pxd                 |   36 +
 dpctl_ext/tensor/__init__.py                  |   74 +
 dpctl_ext/tensor/_accumulation.py             |   25 +-
 dpctl_ext/tensor/_array_api.py                |  256 +++
 dpctl_ext/tensor/_clip.py                     |   81 +-
 dpctl_ext/tensor/_constants.py                |   36 +
 dpctl_ext/tensor/_copy_utils.py               |  175 +-
 dpctl_ext/tensor/_ctors.py                    |   53 +-
 dpctl_ext/tensor/_data_types.py               |  104 +
 dpctl_ext/tensor/_device.py                   |  195 ++
 dpctl_ext/tensor/_dldevice_conversions.py     |   52 +
 dpctl_ext/tensor/_dlpack.pxd                  |   73 +
 dpctl_ext/tensor/_dlpack.pyx                  | 1245 +++++++++++
 dpctl_ext/tensor/_elementwise_common.py       |   67 +-
 dpctl_ext/tensor/_flags.pyx                   |  175 ++
 dpctl_ext/tensor/_indexing_functions.py       |   19 +-
 dpctl_ext/tensor/_linear_algebra_functions.py |  111 +-
 dpctl_ext/tensor/_manipulation_functions.py   |   61 +-
 dpctl_ext/tensor/_print.py                    |  503 +++++
 dpctl_ext/tensor/_reduction.py                |   41 +-
 dpctl_ext/tensor/_reshape.py                  |    5 +-
 dpctl_ext/tensor/_scalar_utils.py             |    7 +-
 dpctl_ext/tensor/_search_functions.py         |   29 +-
 dpctl_ext/tensor/_searchsorted.py             |    8 +-
 dpctl_ext/tensor/_set_functions.py            |  123 +-
 dpctl_ext/tensor/_slicing.pxi                 |  383 ++++
 dpctl_ext/tensor/_sorting.py                  |   47 +-
 dpctl_ext/tensor/_statistical_functions.py    |   39 +-
 dpctl_ext/tensor/_stride_utils.pxi            |  314 +++
 dpctl_ext/tensor/_testing.py                  |   89 +-
 dpctl_ext/tensor/_type_utils.py               |   13 +-
 dpctl_ext/tensor/_types.pxi                   |  169 ++
 dpctl_ext/tensor/_usmarray.pxd                |   88 +
 dpctl_ext/tensor/_usmarray.pyx                | 1986 +++++++++++++++++
 dpctl_ext/tensor/_utility_functions.py        |   39 +-
 .../tensor/include/dlpack/LICENSE.third-party |  201 ++
 dpctl_ext/tensor/include/dlpack/README.md     |    7 +
 dpctl_ext/tensor/include/dlpack/dlpack.h      |  683 ++++++
 dpnp/__init__.py                              |    8 +-
 dpnp/backend/extensions/blas/CMakeLists.txt   |    2 +
 dpnp/backend/extensions/fft/CMakeLists.txt    |    2 +
 .../extensions/indexing/CMakeLists.txt        |    2 +
 dpnp/backend/extensions/lapack/CMakeLists.txt |    1 +
 .../extensions/statistics/CMakeLists.txt      |    2 +
 dpnp/backend/extensions/ufunc/CMakeLists.txt  |    2 +
 dpnp/backend/extensions/vm/CMakeLists.txt     |    2 +
 dpnp/backend/extensions/window/CMakeLists.txt |    2 +
 dpnp/backend/include/dpnp4pybind11.hpp        |   19 +-
 dpnp/dpnp_algo/dpnp_arraycreation.py          |   25 +-
 dpnp/dpnp_algo/dpnp_elementwise_common.py     |   39 +-
 dpnp/dpnp_array.py                            |    8 +-
 dpnp/dpnp_array_api_info.py                   |    4 +-
 dpnp/dpnp_iface.py                            |    9 +-
 dpnp/dpnp_iface_arraycreation.py              |   23 +-
 dpnp/dpnp_iface_indexing.py                   |   71 +-
 dpnp/dpnp_iface_manipulation.py               |   67 +-
 dpnp/dpnp_iface_statistics.py                 |   10 +-
 dpnp/dpnp_iface_types.py                      |    9 +-
 dpnp/dpnp_utils/dpnp_utils_statistics.py      |    5 +-
 dpnp/exceptions/__init__.py                   |    5 +-
 dpnp/memory/_memory.py                        |    5 +-
 dpnp/tests/test_mathematical.py               |   13 +-
 dpnp/tests/test_memory.py                     |    5 +-
 dpnp/tests/test_ndarray.py                    |    5 +-
 .../cupy/core_tests/test_dlpack.py            |    4 +-
 72 files changed, 7532 insertions(+), 658 deletions(-)
 create mode 100644 dpctl_ext/apis/include/dpctl_ext_capi.h
 create mode 100644 dpctl_ext/tensor/__init__.pxd
 create mode 100644 dpctl_ext/tensor/_array_api.py
 create mode 100644 dpctl_ext/tensor/_constants.py
 create mode 100644 dpctl_ext/tensor/_data_types.py
 create mode 100644 dpctl_ext/tensor/_device.py
 create mode 100644 dpctl_ext/tensor/_dldevice_conversions.py
 create mode 100644 dpctl_ext/tensor/_dlpack.pxd
 create mode 100644 dpctl_ext/tensor/_dlpack.pyx
 create mode 100644 dpctl_ext/tensor/_flags.pyx
 create mode 100644 dpctl_ext/tensor/_print.py
 create mode 100644 dpctl_ext/tensor/_slicing.pxi
 create mode 100644 dpctl_ext/tensor/_stride_utils.pxi
 create mode 100644 dpctl_ext/tensor/_types.pxi
 create mode 100644 dpctl_ext/tensor/_usmarray.pxd
 create mode 100644 dpctl_ext/tensor/_usmarray.pyx
 create mode 100644 dpctl_ext/tensor/include/dlpack/LICENSE.third-party
 create mode 100644 dpctl_ext/tensor/include/dlpack/README.md
 create mode 100644 dpctl_ext/tensor/include/dlpack/dlpack.h

diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml
index 0745ca1ca9dc..6dcfd2109c27 100644
--- a/.github/workflows/build-sphinx.yml
+++ b/.github/workflows/build-sphinx.yml
@@ -26,7 +26,7 @@ jobs:
     name: Build and Deploy Docs
 
     runs-on: ubuntu-22.04
-    timeout-minutes: 60
+    timeout-minutes: 90
 
     permissions:
       # Needed to cancel any previous runs that are not completed for a given workflow
diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml
index 2cbe97ab0242..f56018b4ef8e 100644
--- a/.github/workflows/generate_coverage.yaml
+++ b/.github/workflows/generate_coverage.yaml
@@ -11,7 +11,7 @@ jobs:
     name: Generate coverage and push to Coveralls.io
 
     runs-on: ubuntu-latest
-    timeout-minutes: 120
+    timeout-minutes: 150
 
     permissions:
       # Needed to cancel any previous runs that are not completed for a given workflow
@@ -122,7 +122,7 @@ jobs:
         uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
         with:
           shell: bash
-          timeout_minutes: 60
+          timeout_minutes: 120
           max_attempts: 5
           retry_on: error
           command: |
diff --git a/.gitignore b/.gitignore
index 0cfebe53f623..f8ed987fa0d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,4 @@ core
 # TODO: revert to `dpctl/`
 # when dpnp fully migrates dpctl/tensor
 dpctl_ext/**/*.cpython*.so
+dpctl_ext/include/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7bb7f650dac..5db9fe9a6759 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -344,5 +344,16 @@ if(DEFINED SKBUILD)
     set(_ignore_me ${SKBUILD})
 endif()
 
-add_subdirectory(dpnp)
+# DpctlExtCAPI: Interface library for dpctl_ext C-API
+# Provides access to:
+# 1. Public C-API headers from dpctl_ext/apis/include
+# 2. Generated Cython headers via per-target header interface libraries
+
+add_library(DpctlExtCAPI INTERFACE)
+target_include_directories(
+    DpctlExtCAPI
+    INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/dpctl_ext/apis/include
+)
+
 add_subdirectory(dpctl_ext)
+add_subdirectory(dpnp)
diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt
index e58693091422..5baba4de80d0 100644
--- a/dpctl_ext/CMakeLists.txt
+++ b/dpctl_ext/CMakeLists.txt
@@ -112,8 +112,99 @@ else()
 endif()
 
 # at build time create include/ directory and copy header files over
-# set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
 
 set(CMAKE_INSTALL_RPATH "$ORIGIN")
 
+function(build_dpctl_ext _trgt _src _dest)
+    set(options SYCL)
+    cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN})
+    add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src)
+    set(_cythonize_trgt "${_trgt}_cythonize_pyx")
+    python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src})
+    if(BUILD_DPCTL_EXT_SYCL)
+        add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src})
+        target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int)
+        target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel)
+        if(DPCTL_OFFLOAD_COMPRESS)
+            target_link_options(${_trgt} PRIVATE --offload-compress)
+        endif()
+        if(_dpctl_sycl_targets)
+            # make fat binary
+            target_compile_options(
+                ${_trgt}
+                PRIVATE ${_dpctl_sycl_target_compile_options}
+            )
+            target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options})
+        endif()
+    endif()
+    target_link_libraries(${_trgt} PRIVATE Python::NumPy)
+    if(DPCTL_GENERATE_COVERAGE)
+        target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1)
+        if(BUILD_DPCTL_EXT_SYCL)
+            target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer)
+        endif()
+    endif()
+    # Dpctl
+    target_include_directories(${_trgt} PRIVATE ${Dpctl_INCLUDE_DIR})
+    target_link_directories(${_trgt} PRIVATE ${Dpctl_INCLUDE_DIR}/..)
+    target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface)
+    set(_linker_options "LINKER:${DPCTL_LDFLAGS}")
+    target_link_options(${_trgt} PRIVATE ${_linker_options})
+    get_filename_component(_name_wle ${_generated_src} NAME_WLE)
+    get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY)
+    set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h")
+    set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h")
+
+    # TODO: create separate folder inside build folder that contains only
+    #   headers related to this target and appropriate folder structure to
+    #   eliminate shadow dependencies
+    # Go up two levels to build root for "dpctl_ext/tensor/_usmarray.h" resolution
+    get_filename_component(_parent_dir ${_generated_src_dir} DIRECTORY)
+    get_filename_component(_build_root ${_parent_dir} DIRECTORY)
+    # TODO: do not set directory if we did not generate header
+    target_include_directories(${_trgt} INTERFACE ${_build_root})
+    set(_rpath_value "$ORIGIN")
+    if(BUILD_DPCTL_EXT_RELATIVE_PATH)
+        set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}")
+    endif()
+    if(DPCTL_WITH_REDIST)
+        set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..")
+    endif()
+    set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value})
+
+    install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest})
+    install(
+        FILES ${_generated_api_h}
+        DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest}
+        OPTIONAL
+    )
+    install(
+        FILES ${_generated_public_h}
+        DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest}
+        OPTIONAL
+    )
+    if(DPCTL_GENERATE_COVERAGE)
+        get_filename_component(_original_src_dir ${_src} DIRECTORY)
+        file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir})
+        install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir})
+    endif()
+
+    # Create target with headers only, because python is managing all the
+    # library imports at runtime
+    set(_trgt_headers ${_trgt}_headers)
+    add_library(${_trgt_headers} INTERFACE)
+    add_dependencies(${_trgt_headers} ${_trgt})
+    get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES)
+    target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir})
+endfunction()
+
+# Install dpctl_ext C-API headers (similar to dpctl's C-API installation)
+install(
+    DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/apis/include/
+    DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include
+    FILES_MATCHING
+    REGEX "\\.h(pp)?$"
+)
+
 add_subdirectory(tensor)
diff --git a/dpctl_ext/apis/include/dpctl_ext_capi.h b/dpctl_ext/apis/include/dpctl_ext_capi.h
new file mode 100644
index 000000000000..65d332fb73cc
--- /dev/null
+++ b/dpctl_ext/apis/include/dpctl_ext_capi.h
@@ -0,0 +1,106 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides access to dpctl_ext's C-API, including:
+/// - dpctl C-API (from external dpctl package - SYCL interface)
+/// - dpctl_ext tensor C-API (usm_ndarray)
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+// Include dpctl C-API headers explicitly from external dpctl package (SYCL
+// interface)
+// TODO: Once dpctl removes its tensor module and stabilizes dpctl_capi.h,
+// we can simplify to just: #include "dpctl_capi.h"
+// For now, explicit includes ensure we only get SYCL interface without tensor.
+
+#include "syclinterface/dpctl_sycl_extension_interface.h"
+#include "syclinterface/dpctl_sycl_types.h"
+
+#ifdef __cplusplus
+#define CYTHON_EXTERN_C extern "C"
+#else
+#define CYTHON_EXTERN_C
+#endif
+
+#include "dpctl/_sycl_context.h"
+#include "dpctl/_sycl_context_api.h"
+#include "dpctl/_sycl_device.h"
+#include "dpctl/_sycl_device_api.h"
+#include "dpctl/_sycl_event.h"
+#include "dpctl/_sycl_event_api.h"
+#include "dpctl/_sycl_queue.h"
+#include "dpctl/_sycl_queue_api.h"
+#include "dpctl/memory/_memory.h"
+#include "dpctl/memory/_memory_api.h"
+#include "dpctl/program/_program.h"
+#include "dpctl/program/_program_api.h"
+
+// Include the generated Cython C-API headers for usm_ndarray
+// These headers are generated during build and placed in the build directory
+#include "dpctl_ext/tensor/_usmarray.h"
+#include "dpctl_ext/tensor/_usmarray_api.h"
+
+/*
+ * Function to import dpctl_ext C-API and make it available.
+ * This imports both:
+ * - dpctl C-API (from external dpctl package - SYCL interface)
+ * - dpctl_ext C-API (tensor interface - usm_ndarray)
+ *
+ * C functions can use dpctl_ext's C-API functions without linking to
+ * shared objects defining these symbols, if they call `import_dpctl_ext()`
+ * prior to using those symbols.
+ *
+ * It is declared inline to allow multiple definitions in
+ * different translation units.
+ *
+ * TODO: When dpctl_ext is renamed to dpctl.tensor:
+ *   - Rename this file: dpctl_ext_capi.h → dpctl/tensor/tensor_capi.h
+ *     (Use tensor_capi.h, NOT dpctl_capi.h, to avoid conflict with external
+ * dpctl)
+ *   - Rename this function: import_dpctl_ext() → import_dpctl_tensor()
+ *   - Include external dpctl_capi.h and simplify imports to use import_dpctl()
+ */
+static inline void import_dpctl_ext(void)
+{
+    // Import dpctl SYCL interface
+    // TODO: Once dpctl removes its tensor module and stabilizes dpctl_capi.h,
+    // we can simplify to just: import_dpctl()
+    import_dpctl___sycl_device();
+    import_dpctl___sycl_context();
+    import_dpctl___sycl_event();
+    import_dpctl___sycl_queue();
+    import_dpctl__memory___memory();
+    import_dpctl__program___program();
+    // Import dpctl_ext tensor interface
+    import_dpctl_ext__tensor___usmarray();
+    return;
+}
diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index 6f286a8d7198..8df593b0838d 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -29,6 +29,15 @@
 
 find_package(Python COMPONENTS Development.Module)
 
+file(GLOB _cython_sources *.pyx)
+foreach(_cy_file ${_cython_sources})
+    get_filename_component(_trgt ${_cy_file} NAME_WLE)
+    build_dpctl_ext(${_trgt} ${_cy_file} "dpctl_ext/tensor" RELATIVE_PATH "..")
+    target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+    # target_link_libraries(DpctlCAPI INTERFACE ${_trgt}_headers)
+    target_link_libraries(DpctlExtCAPI INTERFACE ${_trgt}_headers)
+endforeach()
+
 if(WIN32)
     if(${CMAKE_VERSION} VERSION_LESS "3.27")
         # this is a work-around for target_link_options inserting option after -link option, cause
@@ -338,6 +347,7 @@ foreach(python_module_name ${_py_trgts})
     #   dpctl4pybind11.hpp. It will allow to simplify dependency tree
     # NOTE: dpctl C-API is resolved at runtime via Python
     # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI)
+    target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
     if(DPNP_WITH_REDIST)
         set_target_properties(
             ${python_module_name}
diff --git a/dpctl_ext/tensor/__init__.pxd b/dpctl_ext/tensor/__init__.pxd
new file mode 100644
index 000000000000..a4bcecfec1d1
--- /dev/null
+++ b/dpctl_ext/tensor/__init__.pxd
@@ -0,0 +1,36 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+""" This file declares the extension types and functions for the Cython API
+    implemented in _usmarray.pyx file.
+"""
+
+# distutils: language = c++
+# cython: language_level=3
+
+from ._usmarray cimport *
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index 7a6923169c1f..03980e194fd0 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -28,7 +28,9 @@
 
 
 from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum
+from ._array_api import __array_api_version__, __array_namespace_info__
 from ._clip import clip
+from ._constants import e, inf, nan, newaxis, pi
 from ._copy_utils import (
     asnumpy,
     astype,
@@ -53,6 +55,29 @@
     zeros,
     zeros_like,
 )
+from ._data_types import (
+    bool,
+    complex64,
+    complex128,
+    dtype,
+    float16,
+    float32,
+    float64,
+    int8,
+    int16,
+    int32,
+    int64,
+    uint8,
+    uint16,
+    uint32,
+    uint64,
+)
+from ._device import Device
+from ._dldevice_conversions import (
+    dldevice_to_sycl_device,
+    sycl_device_to_dldevice,
+)
+from ._dlpack import from_dlpack
 from ._elementwise_funcs import (
     abs,
     acos,
@@ -157,6 +182,13 @@
     tile,
     unstack,
 )
+from ._print import (
+    get_print_options,
+    print_options,
+    set_print_options,
+    usm_ndarray_repr,
+    usm_ndarray_str,
+)
 from ._reduction import (
     argmax,
     argmin,
@@ -168,6 +200,12 @@
     reduce_hypot,
     sum,
 )
+
+# isort: off
+# placed here to avoid circular import
+from ._usmarray import DLDeviceType, usm_ndarray
+
+# isort: on
 from ._reshape import reshape
 from ._search_functions import where
 from ._searchsorted import searchsorted
@@ -185,6 +223,32 @@
 from ._utility_functions import all, any, diff
 
 __all__ = [
+    "Device",
+    "DLDeviceType",
+    "usm_ndarray",
+    # data types
+    "bool",
+    "dtype",
+    "int8",
+    "uint8",
+    "int16",
+    "uint16",
+    "int32",
+    "uint32",
+    "int64",
+    "uint64",
+    "float16",
+    "float32",
+    "float64",
+    "complex64",
+    "complex128",
+    # constants
+    "e",
+    "inf",
+    "nan",
+    "newaxis",
+    "pi",
+    # functions
     "abs",
     "acos",
     "acosh",
@@ -229,6 +293,7 @@
     "cumulative_sum",
     "diff",
     "divide",
+    "dldevice_to_sycl_device",
     "empty",
     "empty_like",
     "equal",
@@ -242,9 +307,11 @@
     "flip",
     "floor",
     "floor_divide",
+    "from_dlpack",
     "from_numpy",
     "full",
     "full_like",
+    "get_print_options",
     "greater",
     "greater_equal",
     "hypot",
@@ -288,6 +355,7 @@
     "place",
     "positive",
     "pow",
+    "print_options",
     "prod",
     "proj",
     "put",
@@ -303,6 +371,7 @@
     "round",
     "rsqrt",
     "searchsorted",
+    "set_print_options",
     "sign",
     "signbit",
     "sin",
@@ -316,6 +385,7 @@
     "subtract",
     "sum",
     "swapaxes",
+    "sycl_device_to_dldevice",
     "take",
     "take_along_axis",
     "tan",
@@ -332,9 +402,13 @@
     "unique_inverse",
     "unique_values",
     "unstack",
+    "usm_ndarray_repr",
+    "usm_ndarray_str",
     "var",
     "vecdot",
     "where",
     "zeros",
     "zeros_like",
+    "__array_api_version__",
+    "__array_namespace_info__",
 ]
diff --git a/dpctl_ext/tensor/_accumulation.py b/dpctl_ext/tensor/_accumulation.py
index 2dfe9656e198..8628628f3bf8 100644
--- a/dpctl_ext/tensor/_accumulation.py
+++ b/dpctl_ext/tensor/_accumulation.py
@@ -27,12 +27,11 @@
 # *****************************************************************************
 
 import dpctl
-import dpctl.tensor as dpt
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_accumulation_impl as tai
 import dpctl_ext.tensor._tensor_impl as ti
 
@@ -82,7 +81,7 @@ def _accumulate_common(
         perm = [i for i in range(nd) if i != axis] + [
             axis,
         ]
-        arr = dpt_ext.permute_dims(x, perm)
+        arr = dpt.permute_dims(x, perm)
     q = x.sycl_queue
     inp_dt = x.dtype
     res_usm_type = x.usm_type
@@ -130,16 +129,16 @@ def _accumulate_common(
             )
         # permute out array dims if necessary
         if a1 != nd:
-            out = dpt_ext.permute_dims(out, perm)
+            out = dpt.permute_dims(out, perm)
             orig_out = out
         if ti._array_overlap(x, out) and implemented_types:
-            out = dpt_ext.empty_like(out)
+            out = dpt.empty_like(out)
     else:
-        out = dpt_ext.empty(
+        out = dpt.empty(
             res_sh, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
         )
         if a1 != nd:
-            out = dpt_ext.permute_dims(out, perm)
+            out = dpt.permute_dims(out, perm)
 
     _manager = SequentialOrderManager[q]
     depends = _manager.submitted_events
@@ -166,7 +165,7 @@ def _accumulate_common(
             out = orig_out
     else:
         if _dtype_supported(res_dt, res_dt):
-            tmp = dpt_ext.empty(
+            tmp = dpt.empty(
                 arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
             )
             ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -191,18 +190,18 @@ def _accumulate_common(
             _manager.add_event_pair(ht_e, acc_ev)
         else:
             buf_dt = _default_accumulation_type_fn(inp_dt, q)
-            tmp = dpt_ext.empty(
+            tmp = dpt.empty(
                 arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
             )
             ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
                 src=arr, dst=tmp, sycl_queue=q, depends=depends
             )
             _manager.add_event_pair(ht_e_cpy, cpy_e)
-            tmp_res = dpt_ext.empty(
+            tmp_res = dpt.empty(
                 res_sh, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
             )
             if a1 != nd:
-                tmp_res = dpt_ext.permute_dims(tmp_res, perm)
+                tmp_res = dpt.permute_dims(tmp_res, perm)
             if not include_initial:
                 ht_e, acc_ev = _accumulate_fn(
                     src=tmp,
@@ -225,10 +224,10 @@ def _accumulate_common(
             _manager.add_event_pair(ht_e_cpy2, cpy_e2)
 
     if appended_axis:
-        out = dpt_ext.squeeze(out)
+        out = dpt.squeeze(out)
     if a1 != nd:
         inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        out = dpt_ext.permute_dims(out, inv_perm)
+        out = dpt.permute_dims(out, inv_perm)
 
     return out
 
diff --git a/dpctl_ext/tensor/_array_api.py b/dpctl_ext/tensor/_array_api.py
new file mode 100644
index 000000000000..09f71bc1bdd3
--- /dev/null
+++ b/dpctl_ext/tensor/_array_api.py
@@ -0,0 +1,256 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
+
+from ._tensor_impl import (
+    default_device_complex_type,
+    default_device_fp_type,
+    default_device_index_type,
+    default_device_int_type,
+)
+
+
+def _isdtype_impl(dtype, kind):
+    if isinstance(kind, str):
+        if kind == "bool":
+            return dtype.kind == "b"
+        elif kind == "signed integer":
+            return dtype.kind == "i"
+        elif kind == "unsigned integer":
+            return dtype.kind == "u"
+        elif kind == "integral":
+            return dtype.kind in "iu"
+        elif kind == "real floating":
+            return dtype.kind == "f"
+        elif kind == "complex floating":
+            return dtype.kind == "c"
+        elif kind == "numeric":
+            return dtype.kind in "iufc"
+        else:
+            raise ValueError(f"Unrecognized data type kind: {kind}")
+
+    elif isinstance(kind, tuple):
+        return any(_isdtype_impl(dtype, k) for k in kind)
+    else:
+        raise TypeError(f"Unsupported type for dtype kind: {type(kind)}")
+
+
+def _get_device_impl(d):
+    if d is None:
+        return dpctl.select_default_device()
+    elif isinstance(d, dpctl.SyclDevice):
+        return d
+    elif isinstance(d, (dpt.Device, dpctl.SyclQueue)):
+        return d.sycl_device
+    else:
+        try:
+            return dpctl.SyclDevice(d)
+        except TypeError:
+            raise TypeError(f"Unsupported type for device argument: {type(d)}")
+
+
+__array_api_version__ = "2024.12"
+
+
+class Info:
+    """namespace returned by ``__array_namespace_info__()``"""
+
+    def __init__(self):
+        self._capabilities = {
+            "boolean indexing": True,
+            "data-dependent shapes": True,
+            "max dimensions": None,
+        }
+        self._all_dtypes = {
+            "bool": dpt.bool,
+            "float32": dpt.float32,
+            "float64": dpt.float64,
+            "complex64": dpt.complex64,
+            "complex128": dpt.complex128,
+            "int8": dpt.int8,
+            "int16": dpt.int16,
+            "int32": dpt.int32,
+            "int64": dpt.int64,
+            "uint8": dpt.uint8,
+            "uint16": dpt.uint16,
+            "uint32": dpt.uint32,
+            "uint64": dpt.uint64,
+        }
+
+    def capabilities(self):
+        """
+        capabilities()
+
+        Returns a dictionary of ``dpctl``'s capabilities.
+
+        The dictionary contains the following keys:
+            ``"boolean indexing"``:
+                boolean indicating ``dpctl``'s support of boolean indexing.
+                Value: ``True``
+            ``"data-dependent shapes"``:
+                boolean indicating ``dpctl``'s support of data-dependent shapes.
+                Value: ``True``
+            ``max dimensions``:
+                integer indication the maximum array dimension supported by ``dpctl``.
+                Value: ``None``
+
+        Returns:
+            dict:
+                dictionary of ``dpctl``'s capabilities
+        """
+        return self._capabilities.copy()
+
+    def default_device(self):
+        """
+        default_device()
+
+        Returns the default SYCL device.
+        """
+        return dpctl.select_default_device()
+
+    def default_dtypes(self, *, device=None):
+        """
+        default_dtypes(*, device=None)
+
+        Returns a dictionary of default data types for ``device``.
+
+        Args:
+            device (Optional[:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`, :class:`dpctl.tensor.Device`, str]):
+                array API concept of device used in getting default data types.
+                ``device`` can be ``None`` (in which case the default device
+                is used), an instance of :class:`dpctl.SyclDevice`, an instance
+                of :class:`dpctl.SyclQueue`, a :class:`dpctl.tensor.Device`
+                object returned by :attr:`dpctl.tensor.usm_ndarray.device`, or
+                a filter selector string.
+                Default: ``None``.
+
+        Returns:
+            dict:
+                a dictionary of default data types for ``device``:
+
+                    - ``"real floating"``: dtype
+                    - ``"complex floating"``: dtype
+                    - ``"integral"``: dtype
+                    - ``"indexing"``: dtype
+        """
+        device = _get_device_impl(device)
+        return {
+            "real floating": dpt.dtype(default_device_fp_type(device)),
+            "complex floating": dpt.dtype(default_device_complex_type(device)),
+            "integral": dpt.dtype(default_device_int_type(device)),
+            "indexing": dpt.dtype(default_device_index_type(device)),
+        }
+
+    def dtypes(self, *, device=None, kind=None):
+        """
+        dtypes(*, device=None, kind=None)
+
+        Returns a dictionary of all Array API data types of a specified
+        ``kind`` supported by ``device``.
+
+        This dictionary only includes data types supported by the
+        `Python Array API <https://data-apis.org/array-api/latest/>`_
+        specification.
+
+        Args:
+            device (Optional[:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`, :class:`dpctl.tensor.Device`, str]):
+                array API concept of device used in getting default data types.
+                ``device`` can be ``None`` (in which case the default device is
+                used), an instance of :class:`dpctl.SyclDevice`, an instance of
+                :class:`dpctl.SyclQueue`, a :class:`dpctl.tensor.Device`
+                object returned by :attr:`dpctl.tensor.usm_ndarray.device`, or
+                a filter selector string.
+                Default: ``None``.
+
+            kind (Optional[str, Tuple[str, ...]]):
+                data type kind.
+
+                - if ``kind`` is ``None``, returns a dictionary of all data
+                  types supported by `device`
+                - if ``kind`` is a string, returns a dictionary containing the
+                  data types belonging to the data type kind specified.
+
+                  Supports:
+
+                  * ``"bool"``
+                  * ``"signed integer"``
+                  * ``"unsigned integer"``
+                  * ``"integral"``
+                  * ``"real floating"``
+                  * ``"complex floating"``
+                  * ``"numeric"``
+
+                - if ``kind`` is a tuple, the tuple represents a union of
+                  ``kind`` strings, and returns a dictionary containing data
+                  types corresponding to the-specified union.
+
+                Default: ``None``.
+
+        Returns:
+            dict:
+                a dictionary of the supported data types of the specified
+                ``kind``
+        """
+        device = _get_device_impl(device)
+        _fp64 = device.has_aspect_fp64
+        if kind is None:
+            return {
+                key: val
+                for key, val in self._all_dtypes.items()
+                if _fp64 or (key != "float64" and key != "complex128")
+            }
+        else:
+            return {
+                key: val
+                for key, val in self._all_dtypes.items()
+                if (_fp64 or (key != "float64" and key != "complex128"))
+                and _isdtype_impl(val, kind)
+            }
+
+    def devices(self):
+        """
+        devices()
+
+        Returns a list of supported devices.
+        """
+        return dpctl.get_devices()
+
+
+def __array_namespace_info__():
+    """
+    __array_namespace_info__()
+
+    Returns a namespace with Array API namespace inspection utilities.
+
+    """
+    return Info()
diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py
index c21d601966bd..8071f13bee19 100644
--- a/dpctl_ext/tensor/_clip.py
+++ b/dpctl_ext/tensor/_clip.py
@@ -27,12 +27,11 @@
 # *****************************************************************************
 
 import dpctl
-import dpctl.tensor as dpt
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_elementwise_impl as tei
 import dpctl_ext.tensor._tensor_impl as ti
 
@@ -163,7 +162,7 @@ def _clip_none(x, val, out, order, _binary_fn):
 
         if ti._array_overlap(x, out):
             if not ti._same_logical_tensors(x, out):
-                out = dpt_ext.empty_like(out)
+                out = dpt.empty_like(out)
 
         if isinstance(val, dpt.usm_ndarray):
             if (
@@ -171,12 +170,12 @@ def _clip_none(x, val, out, order, _binary_fn):
                 and not ti._same_logical_tensors(val, out)
                 and val_dtype == res_dt
             ):
-                out = dpt_ext.empty_like(out)
+                out = dpt.empty_like(out)
 
     if isinstance(val, dpt.usm_ndarray):
         val_ary = val
     else:
-        val_ary = dpt_ext.asarray(val, dtype=val_dtype, sycl_queue=exec_q)
+        val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q)
 
     if order == "A":
         order = (
@@ -197,7 +196,7 @@ def _clip_none(x, val, out, order, _binary_fn):
                     x, val_ary, res_dt, res_shape, res_usm_type, exec_q
                 )
             else:
-                out = dpt_ext.empty(
+                out = dpt.empty(
                     res_shape,
                     dtype=res_dt,
                     usm_type=res_usm_type,
@@ -205,9 +204,9 @@ def _clip_none(x, val, out, order, _binary_fn):
                     order=order,
                 )
         if x_shape != res_shape:
-            x = dpt_ext.broadcast_to(x, res_shape)
+            x = dpt.broadcast_to(x, res_shape)
         if val_ary.shape != res_shape:
-            val_ary = dpt_ext.broadcast_to(val_ary, res_shape)
+            val_ary = dpt.broadcast_to(val_ary, res_shape)
         _manager = SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
         ht_binary_ev, binary_ev = _binary_fn(
@@ -229,7 +228,7 @@ def _clip_none(x, val, out, order, _binary_fn):
         if order == "K":
             buf = _empty_like_orderK(val_ary, res_dt)
         else:
-            buf = dpt_ext.empty_like(val_ary, dtype=res_dt, order=order)
+            buf = dpt.empty_like(val_ary, dtype=res_dt, order=order)
         _manager = SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
         ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -242,7 +241,7 @@ def _clip_none(x, val, out, order, _binary_fn):
                     x, buf, res_dt, res_shape, res_usm_type, exec_q
                 )
             else:
-                out = dpt_ext.empty(
+                out = dpt.empty(
                     res_shape,
                     dtype=res_dt,
                     usm_type=res_usm_type,
@@ -251,8 +250,8 @@ def _clip_none(x, val, out, order, _binary_fn):
                 )
 
         if x_shape != res_shape:
-            x = dpt_ext.broadcast_to(x, res_shape)
-        buf = dpt_ext.broadcast_to(buf, res_shape)
+            x = dpt.broadcast_to(x, res_shape)
+        buf = dpt.broadcast_to(buf, res_shape)
         ht_binary_ev, binary_ev = _binary_fn(
             src1=x,
             src2=buf,
@@ -313,9 +312,9 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
     if order not in ["K", "C", "F", "A"]:
         order = "K"
     if x.dtype.kind in "iu":
-        if isinstance(min, int) and min <= dpt_ext.iinfo(x.dtype).min:
+        if isinstance(min, int) and min <= dpt.iinfo(x.dtype).min:
             min = None
-        if isinstance(max, int) and max >= dpt_ext.iinfo(x.dtype).max:
+        if isinstance(max, int) and max >= dpt.iinfo(x.dtype).max:
             max = None
     if min is None and max is None:
         exec_q = x.sycl_queue
@@ -353,14 +352,14 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
 
             if ti._array_overlap(x, out):
                 if not ti._same_logical_tensors(x, out):
-                    out = dpt_ext.empty_like(out)
+                    out = dpt.empty_like(out)
                 else:
                     return out
         else:
             if order == "K":
                 out = _empty_like_orderK(x, x.dtype)
             else:
-                out = dpt_ext.empty_like(x, order=order)
+                out = dpt.empty_like(x, order=order)
 
         _manager = SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
@@ -519,7 +518,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
 
             if ti._array_overlap(x, out):
                 if not ti._same_logical_tensors(x, out):
-                    out = dpt_ext.empty_like(out)
+                    out = dpt.empty_like(out)
 
             if isinstance(min, dpt.usm_ndarray):
                 if (
@@ -527,7 +526,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                     and not ti._same_logical_tensors(min, out)
                     and buf1_dt is None
                 ):
-                    out = dpt_ext.empty_like(out)
+                    out = dpt.empty_like(out)
 
             if isinstance(max, dpt.usm_ndarray):
                 if (
@@ -535,16 +534,16 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                     and not ti._same_logical_tensors(max, out)
                     and buf2_dt is None
                 ):
-                    out = dpt_ext.empty_like(out)
+                    out = dpt.empty_like(out)
 
         if isinstance(min, dpt.usm_ndarray):
             a_min = min
         else:
-            a_min = dpt_ext.asarray(min, dtype=min_dtype, sycl_queue=exec_q)
+            a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q)
         if isinstance(max, dpt.usm_ndarray):
             a_max = max
         else:
-            a_max = dpt_ext.asarray(max, dtype=max_dtype, sycl_queue=exec_q)
+            a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q)
 
         if order == "A":
             order = (
@@ -572,7 +571,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                         exec_q,
                     )
                 else:
-                    out = dpt_ext.empty(
+                    out = dpt.empty(
                         res_shape,
                         dtype=res_dt,
                         usm_type=res_usm_type,
@@ -580,11 +579,11 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                         order=order,
                     )
             if x_shape != res_shape:
-                x = dpt_ext.broadcast_to(x, res_shape)
+                x = dpt.broadcast_to(x, res_shape)
             if a_min.shape != res_shape:
-                a_min = dpt_ext.broadcast_to(a_min, res_shape)
+                a_min = dpt.broadcast_to(a_min, res_shape)
             if a_max.shape != res_shape:
-                a_max = dpt_ext.broadcast_to(a_max, res_shape)
+                a_max = dpt.broadcast_to(a_max, res_shape)
             _manager = SequentialOrderManager[exec_q]
             dep_ev = _manager.submitted_events
             ht_binary_ev, binary_ev = ti._clip(
@@ -612,7 +611,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
             if order == "K":
                 buf2 = _empty_like_orderK(a_max, buf2_dt)
             else:
-                buf2 = dpt_ext.empty_like(a_max, dtype=buf2_dt, order=order)
+                buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order)
             _manager = SequentialOrderManager[exec_q]
             dep_ev = _manager.submitted_events
             ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -631,7 +630,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                         exec_q,
                     )
                 else:
-                    out = dpt_ext.empty(
+                    out = dpt.empty(
                         res_shape,
                         dtype=res_dt,
                         usm_type=res_usm_type,
@@ -639,10 +638,10 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                         order=order,
                     )
 
-            x = dpt_ext.broadcast_to(x, res_shape)
+            x = dpt.broadcast_to(x, res_shape)
             if a_min.shape != res_shape:
-                a_min = dpt_ext.broadcast_to(a_min, res_shape)
-            buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+                a_min = dpt.broadcast_to(a_min, res_shape)
+            buf2 = dpt.broadcast_to(buf2, res_shape)
             ht_binary_ev, binary_ev = ti._clip(
                 src=x,
                 min=a_min,
@@ -668,7 +667,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
             if order == "K":
                 buf1 = _empty_like_orderK(a_min, buf1_dt)
             else:
-                buf1 = dpt_ext.empty_like(a_min, dtype=buf1_dt, order=order)
+                buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order)
             _manager = SequentialOrderManager[exec_q]
             dep_ev = _manager.submitted_events
             ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -687,7 +686,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                         exec_q,
                     )
                 else:
-                    out = dpt_ext.empty(
+                    out = dpt.empty(
                         res_shape,
                         dtype=res_dt,
                         usm_type=res_usm_type,
@@ -695,10 +694,10 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                         order=order,
                     )
 
-            x = dpt_ext.broadcast_to(x, res_shape)
-            buf1 = dpt_ext.broadcast_to(buf1, res_shape)
+            x = dpt.broadcast_to(x, res_shape)
+            buf1 = dpt.broadcast_to(buf1, res_shape)
             if a_max.shape != res_shape:
-                a_max = dpt_ext.broadcast_to(a_max, res_shape)
+                a_max = dpt.broadcast_to(a_max, res_shape)
             ht_binary_ev, binary_ev = ti._clip(
                 src=x,
                 min=buf1,
@@ -736,7 +735,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
         if order == "K":
             buf1 = _empty_like_orderK(a_min, buf1_dt)
         else:
-            buf1 = dpt_ext.empty_like(a_min, dtype=buf1_dt, order=order)
+            buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order)
 
         _manager = SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
@@ -747,7 +746,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
         if order == "K":
             buf2 = _empty_like_orderK(a_max, buf2_dt)
         else:
-            buf2 = dpt_ext.empty_like(a_max, dtype=buf2_dt, order=order)
+            buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order)
         ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs
         )
@@ -758,7 +757,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                     x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
                 )
             else:
-                out = dpt_ext.empty(
+                out = dpt.empty(
                     res_shape,
                     dtype=res_dt,
                     usm_type=res_usm_type,
@@ -766,9 +765,9 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                     order=order,
                 )
 
-        x = dpt_ext.broadcast_to(x, res_shape)
-        buf1 = dpt_ext.broadcast_to(buf1, res_shape)
-        buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+        x = dpt.broadcast_to(x, res_shape)
+        buf1 = dpt.broadcast_to(buf1, res_shape)
+        buf2 = dpt.broadcast_to(buf2, res_shape)
         ht_, clip_ev = ti._clip(
             src=x,
             min=buf1,
diff --git a/dpctl_ext/tensor/_constants.py b/dpctl_ext/tensor/_constants.py
new file mode 100644
index 000000000000..4c134bd9d375
--- /dev/null
+++ b/dpctl_ext/tensor/_constants.py
@@ -0,0 +1,36 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+
+newaxis = None
+
+pi = np.pi
+e = np.e
+nan = np.nan
+inf = np.inf
diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py
index 37879997b788..b056511ac33b 100644
--- a/dpctl_ext/tensor/_copy_utils.py
+++ b/dpctl_ext/tensor/_copy_utils.py
@@ -32,17 +32,16 @@
 
 import dpctl
 import dpctl.memory as dpm
-import dpctl.tensor as dpt
 import dpctl.utils
 import numpy as np
-from dpctl.tensor._data_types import _get_dtype
-from dpctl.tensor._device import normalize_queue_device
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 
+from ._data_types import _get_dtype
+from ._device import normalize_queue_device
 from ._numpy_helper import normalize_axis_index
 from ._type_utils import _dtype_supported_by_device_impl
 
@@ -91,7 +90,7 @@ def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None):
         )
     else:
         Xusm_dtype = dt
-    Xusm = dpt_ext.empty(
+    Xusm = dpt.empty(
         Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue
     )
     _copy_from_numpy_into(Xusm, Xnp)
@@ -159,7 +158,7 @@ def _extract_impl(ary, ary_mask, axis=0):
     elif isinstance(ary_mask, np.ndarray):
         dst_usm_type = ary.usm_type
         exec_q = ary.sycl_queue
-        ary_mask = dpt_ext.asarray(
+        ary_mask = dpt.asarray(
             ary_mask, usm_type=dst_usm_type, sycl_queue=exec_q
         )
     else:
@@ -176,7 +175,7 @@ def _extract_impl(ary, ary_mask, axis=0):
         )
     mask_nelems = ary_mask.size
     cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
-    cumsum = dpt_ext.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device)
+    cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device)
     exec_q = cumsum.sycl_queue
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
@@ -184,7 +183,7 @@ def _extract_impl(ary, ary_mask, axis=0):
         ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs
     )
     dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
-    dst = dpt_ext.empty(
+    dst = dpt.empty(
         dst_shape, dtype=ary.dtype, usm_type=dst_usm_type, device=ary.device
     )
     if dst.size == 0:
@@ -247,7 +246,7 @@ def _nonzero_impl(ary):
     usm_type = ary.usm_type
     mask_nelems = ary.size
     cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
-    cumsum = dpt_ext.empty(
+    cumsum = dpt.empty(
         mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C"
     )
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
@@ -256,7 +255,7 @@ def _nonzero_impl(ary):
         ary, cumsum, sycl_queue=exec_q, depends=dep_evs
     )
     indexes_dt = ti.default_device_index_type(exec_q.sycl_device)
-    indexes = dpt_ext.empty(
+    indexes = dpt.empty(
         (ary.ndim, mask_count),
         dtype=indexes_dt,
         usm_type=usm_type,
@@ -284,14 +283,14 @@ def _prepare_indices_arrays(inds, q, usm_type):
             lambda ind: (
                 ind
                 if isinstance(ind, dpt.usm_ndarray)
-                else dpt_ext.asarray(ind, usm_type=usm_type, sycl_queue=q)
+                else dpt.asarray(ind, usm_type=usm_type, sycl_queue=q)
             ),
             inds,
         )
     )
 
     # promote to a common integral type if possible
-    ind_dt = dpt_ext.result_type(*inds)
+    ind_dt = dpt.result_type(*inds)
     if ind_dt.kind not in "ui":
         raise ValueError(
             "cannot safely promote indices to an integer data type"
@@ -299,18 +298,122 @@ def _prepare_indices_arrays(inds, q, usm_type):
     inds = tuple(
         map(
             lambda ind: (
-                ind if ind.dtype == ind_dt else dpt_ext.astype(ind, ind_dt)
+                ind if ind.dtype == ind_dt else dpt.astype(ind, ind_dt)
             ),
             inds,
         )
     )
 
     # broadcast
-    inds = dpt_ext.broadcast_arrays(*inds)
+    inds = dpt.broadcast_arrays(*inds)
 
     return inds
 
 
+def _place_impl(ary, ary_mask, vals, axis=0):
+    """
+    Extract elements of ary by applying mask starting from slot
+    dimension axis.
+    """
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+        )
+    if isinstance(ary_mask, dpt.usm_ndarray):
+        exec_q = dpctl.utils.get_execution_queue(
+            (
+                ary.sycl_queue,
+                ary_mask.sycl_queue,
+            )
+        )
+        coerced_usm_type = dpctl.utils.get_coerced_usm_type(
+            (
+                ary.usm_type,
+                ary_mask.usm_type,
+            )
+        )
+        if exec_q is None:
+            raise dpctl.utils.ExecutionPlacementError(
+                "arrays have different associated queues. "
+                "Use `y.to_device(x.device)` to migrate."
+            )
+    elif isinstance(ary_mask, np.ndarray):
+        exec_q = ary.sycl_queue
+        coerced_usm_type = ary.usm_type
+        ary_mask = dpt.asarray(
+            ary_mask, usm_type=coerced_usm_type, sycl_queue=exec_q
+        )
+    else:
+        raise TypeError(
+            "Expecting type dpctl.tensor.usm_ndarray or numpy.ndarray, got "
+            f"{type(ary_mask)}"
+        )
+    if exec_q is not None:
+        if not isinstance(vals, dpt.usm_ndarray):
+            vals = dpt.asarray(
+                vals,
+                dtype=ary.dtype,
+                usm_type=coerced_usm_type,
+                sycl_queue=exec_q,
+            )
+        else:
+            exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue))
+            coerced_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    coerced_usm_type,
+                    vals.usm_type,
+                )
+            )
+    if exec_q is None:
+        raise dpctl.utils.ExecutionPlacementError(
+            "arrays have different associated queues. "
+            "Use `Y.to_device(X.device)` to migrate."
+        )
+    ary_nd = ary.ndim
+    pp = normalize_axis_index(operator.index(axis), ary_nd)
+    mask_nd = ary_mask.ndim
+    if pp < 0 or pp + mask_nd > ary_nd:
+        raise ValueError(
+            "Parameter p is inconsistent with input array dimensions"
+        )
+    mask_nelems = ary_mask.size
+    cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
+    cumsum = dpt.empty(
+        mask_nelems,
+        dtype=cumsum_dt,
+        usm_type=coerced_usm_type,
+        device=ary_mask.device,
+    )
+    exec_q = cumsum.sycl_queue
+    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    dep_ev = _manager.submitted_events
+    mask_count = ti.mask_positions(
+        ary_mask, cumsum, sycl_queue=exec_q, depends=dep_ev
+    )
+    expected_vals_shape = (
+        ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
+    )
+    if vals.dtype == ary.dtype:
+        rhs = vals
+    else:
+        rhs = dpt.astype(vals, ary.dtype)
+    rhs = dpt.broadcast_to(rhs, expected_vals_shape)
+    if mask_nelems == 0:
+        return
+    dep_ev = _manager.submitted_events
+    hev, pl_ev = ti._place(
+        dst=ary,
+        cumsum=cumsum,
+        axis_start=pp,
+        axis_end=pp + mask_nd,
+        rhs=rhs,
+        sycl_queue=exec_q,
+        depends=dep_ev,
+    )
+    _manager.add_event_pair(hev, pl_ev)
+    return
+
+
 def _put_multi_index(ary, inds, p, vals, mode=0):
     if not isinstance(ary, dpt.usm_ndarray):
         raise TypeError(
@@ -332,7 +435,7 @@ def _put_multi_index(ary, inds, p, vals, mode=0):
 
     if exec_q is not None:
         if not isinstance(vals, dpt.usm_ndarray):
-            vals = dpt_ext.asarray(
+            vals = dpt.asarray(
                 vals,
                 dtype=ary.dtype,
                 usm_type=coerced_usm_type,
@@ -367,8 +470,8 @@ def _put_multi_index(ary, inds, p, vals, mode=0):
     if vals.dtype == ary.dtype:
         rhs = vals
     else:
-        rhs = dpt_ext.astype(vals, ary.dtype)
-    rhs = dpt_ext.broadcast_to(rhs, expected_vals_shape)
+        rhs = dpt.astype(vals, ary.dtype)
+    rhs = dpt.broadcast_to(rhs, expected_vals_shape)
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
     dep_ev = _manager.submitted_events
     hev, put_ev = ti._put(
@@ -418,7 +521,7 @@ def _take_multi_index(ary, inds, p, mode=0):
     if 0 in ary_sh[p:p_end] and ind0.size != 0:
         raise IndexError("cannot take non-empty indices from an empty axis")
     res_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:]
-    res = dpt_ext.empty(
+    res = dpt.empty(
         res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q
     )
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
@@ -681,9 +784,7 @@ def _make_empty_like_orderK(x, dt, usm_type, dev):
     inv_perm = sorted(range(x.ndim), key=lambda i: perm[i])
     sh = x.shape
     sh_sorted = tuple(sh[i] for i in perm)
-    R = dpt_ext.empty(
-        sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C"
-    )
+    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
     if min(st) < 0:
         st_sorted = [st[i] for i in perm]
         sl = tuple(
@@ -695,7 +796,7 @@ def _make_empty_like_orderK(x, dt, usm_type, dev):
             for i in range(x.ndim)
         )
         R = R[sl]
-    return dpt_ext.permute_dims(R, inv_perm)
+    return dpt.permute_dims(R, inv_perm)
 
 
 def _empty_like_orderK(x, dt, usm_type=None, dev=None):
@@ -714,11 +815,11 @@ def _empty_like_orderK(x, dt, usm_type=None, dev=None):
         dev = x.device
     fl = x.flags
     if fl["C"] or x.size <= 1:
-        return dpt_ext.empty_like(
+        return dpt.empty_like(
             x, dtype=dt, usm_type=usm_type, device=dev, order="C"
         )
     elif fl["F"]:
-        return dpt_ext.empty_like(
+        return dpt.empty_like(
             x, dtype=dt, usm_type=usm_type, device=dev, order="F"
         )
     return _make_empty_like_orderK(x, dt, usm_type, dev)
@@ -736,11 +837,11 @@ def _from_numpy_empty_like_orderK(x, dt, usm_type, dev):
         raise TypeError(f"Expected numpy.ndarray, got {type(x)}")
     fl = x.flags
     if fl["C"] or x.size <= 1:
-        return dpt_ext.empty(
+        return dpt.empty(
             x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
         )
     elif fl["F"]:
-        return dpt_ext.empty(
+        return dpt.empty(
             x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
         )
     return _make_empty_like_orderK(x, dt, usm_type, dev)
@@ -760,11 +861,11 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
     fl1 = X1.flags
     fl2 = X2.flags
     if fl1["C"] or fl2["C"]:
-        return dpt_ext.empty(
+        return dpt.empty(
             res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
         )
     if fl1["F"] and fl2["F"]:
-        return dpt_ext.empty(
+        return dpt.empty(
             res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
         )
     st1 = list(X1.strides)
@@ -787,9 +888,7 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
     st2_sorted = [st2[i] for i in perm]
     sh = res_shape
     sh_sorted = tuple(sh[i] for i in perm)
-    R = dpt_ext.empty(
-        sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C"
-    )
+    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
     if max(min(st1_sorted), min(st2_sorted)) < 0:
         sl = tuple(
             (
@@ -800,7 +899,7 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
             for i in range(nd1)
         )
         R = R[sl]
-    return dpt_ext.permute_dims(R, inv_perm)
+    return dpt.permute_dims(R, inv_perm)
 
 
 def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
@@ -827,11 +926,11 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
     fl2 = X2.flags
     fl3 = X3.flags
     if fl1["C"] or fl2["C"] or fl3["C"]:
-        return dpt_ext.empty(
+        return dpt.empty(
             res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
         )
     if fl1["F"] and fl2["F"] and fl3["F"]:
-        return dpt_ext.empty(
+        return dpt.empty(
             res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
         )
     st1 = list(X1.strides)
@@ -859,9 +958,7 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
     st3_sorted = [st3[i] for i in perm]
     sh = res_shape
     sh_sorted = tuple(sh[i] for i in perm)
-    R = dpt_ext.empty(
-        sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C"
-    )
+    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
     if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0:
         sl = tuple(
             (
@@ -876,7 +973,7 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
             for i in range(nd1)
         )
         R = R[sl]
-    return dpt_ext.permute_dims(R, inv_perm)
+    return dpt.permute_dims(R, inv_perm)
 
 
 def copy(usm_ary, /, *, order="K"):
@@ -1019,7 +1116,7 @@ def astype(
     else:
         target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue)
 
-    if not dpt_ext.can_cast(ary_dtype, target_dtype, casting=casting):
+    if not dpt.can_cast(ary_dtype, target_dtype, casting=casting):
         raise TypeError(
             f"Can not cast from {ary_dtype} to {newdtype} "
             f"according to rule {casting}."
diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py
index 21c3d0077189..041faba73205 100644
--- a/dpctl_ext/tensor/_ctors.py
+++ b/dpctl_ext/tensor/_ctors.py
@@ -31,22 +31,21 @@
 
 import dpctl
 import dpctl.memory as dpm
-import dpctl.tensor as dpt
 import dpctl.utils
 import numpy as np
-from dpctl.tensor._data_types import _get_dtype
-from dpctl.tensor._device import normalize_queue_device
-from dpctl.tensor._usmarray import _is_object_with_buffer_protocol
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 
 from ._copy_utils import (
     _empty_like_orderK,
     _from_numpy_empty_like_orderK,
 )
+from ._data_types import _get_dtype
+from ._device import normalize_queue_device
+from ._usmarray import _is_object_with_buffer_protocol
 
 __doc__ = "Implementation of creation functions in :module:`dpctl.tensor`"
 
@@ -182,7 +181,7 @@ def _asarray_from_seq(
     if order in "KA":
         order = "C"
     if isinstance(exec_q, dpctl.SyclQueue):
-        res = dpt_ext.empty(
+        res = dpt.empty(
             seq_shape,
             dtype=dtype,
             usm_type=usm_type,
@@ -193,7 +192,7 @@ def _asarray_from_seq(
         _device_copy_walker(seq_obj, res, _manager)
         return res
     else:
-        res = dpt_ext.empty(
+        res = dpt.empty(
             seq_shape,
             dtype=dtype,
             usm_type=usm_type,
@@ -312,7 +311,7 @@ def _asarray_from_usm_ndarray(
         )
         _manager.add_event_pair(hev, cpy_ev)
     else:
-        tmp = dpt_ext.asnumpy(usm_ndary)
+        tmp = dpt.asnumpy(usm_ndary)
         res[...] = tmp
     return res
 
@@ -361,7 +360,7 @@ def _copy_through_host_walker(seq_o, usm_res):
             )
             is None
         ):
-            usm_res[...] = dpt_ext.asnumpy(seq_o).copy()
+            usm_res[...] = dpt.asnumpy(seq_o).copy()
             return
         else:
             usm_res[...] = seq_o
@@ -381,7 +380,7 @@ def _copy_through_host_walker(seq_o, usm_res):
             )
             is None
         ):
-            usm_res[...] = dpt_ext.asnumpy(usm_ar).copy()
+            usm_res[...] = dpt.asnumpy(usm_ar).copy()
         else:
             usm_res[...] = usm_ar
         return
@@ -1092,7 +1091,7 @@ def eye(
     n_cols = n_rows if n_cols is None else operator.index(n_cols)
     k = operator.index(k)
     if k >= n_cols or -k >= n_rows:
-        return dpt_ext.zeros(
+        return dpt.zeros(
             (n_rows, n_cols),
             dtype=dtype,
             order=order,
@@ -1194,14 +1193,14 @@ def full(
             sycl_queue = normalize_queue_device(
                 sycl_queue=sycl_queue, device=device
             )
-        X = dpt_ext.asarray(
+        X = dpt.asarray(
             fill_value,
             dtype=dtype,
             order=order,
             usm_type=usm_type,
             sycl_queue=sycl_queue,
         )
-        return dpt_ext.copy(dpt_ext.broadcast_to(X, shape), order=order)
+        return dpt.copy(dpt.broadcast_to(X, shape), order=order)
     else:
         _validate_fill_value(fill_value)
 
@@ -1301,14 +1300,14 @@ def full_like(
     if order == "K":
         _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
         if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)):
-            X = dpt_ext.asarray(
+            X = dpt.asarray(
                 fill_value,
                 dtype=dtype,
                 order=order,
                 usm_type=usm_type,
                 sycl_queue=sycl_queue,
             )
-            X = dpt_ext.broadcast_to(X, sh)
+            X = dpt.broadcast_to(X, sh)
             res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
             _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
             # order copy after tasks populating X
@@ -1434,14 +1433,14 @@ def linspace(
         start = float(start)
         stop = float(stop)
 
-    res = dpt_ext.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue)
+    res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue)
     _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
     hev, la_ev = ti._linspace_affine(
         start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue
     )
     _manager.add_event_pair(hev, la_ev)
 
-    return res if int_dt is None else dpt_ext.astype(res, int_dt)
+    return res if int_dt is None else dpt.astype(res, int_dt)
 
 
 def meshgrid(*arrays, indexing="xy"):
@@ -1506,15 +1505,15 @@ def meshgrid(*arrays, indexing="xy"):
 
     res = []
     if n > 1 and indexing == "xy":
-        res.append(dpt_ext.reshape(arrays[0], (1, -1) + sh[2:], copy=True))
-        res.append(dpt_ext.reshape(arrays[1], sh, copy=True))
+        res.append(dpt.reshape(arrays[0], (1, -1) + sh[2:], copy=True))
+        res.append(dpt.reshape(arrays[1], sh, copy=True))
         arrays, sh = arrays[2:], sh[-2:] + sh[:-2]
 
     for array in arrays:
-        res.append(dpt_ext.reshape(array, sh, copy=True))
+        res.append(dpt.reshape(array, sh, copy=True))
         sh = sh[-1:] + sh[:-1]
 
-    output = dpt_ext.broadcast_arrays(*res)
+    output = dpt.broadcast_arrays(*res)
 
     return output
 
@@ -1707,7 +1706,7 @@ def tril(x, /, *, k=0):
 
     q = x.sycl_queue
     if k >= shape[nd - 1] - 1:
-        res = dpt_ext.empty(
+        res = dpt.empty(
             x.shape,
             dtype=x.dtype,
             order=order,
@@ -1721,7 +1720,7 @@ def tril(x, /, *, k=0):
         )
         _manager.add_event_pair(hev, cpy_ev)
     elif k < -shape[nd - 2]:
-        res = dpt_ext.zeros(
+        res = dpt.zeros(
             x.shape,
             dtype=x.dtype,
             order=order,
@@ -1729,7 +1728,7 @@ def tril(x, /, *, k=0):
             sycl_queue=q,
         )
     else:
-        res = dpt_ext.empty(
+        res = dpt.empty(
             x.shape,
             dtype=x.dtype,
             order=order,
@@ -1785,7 +1784,7 @@ def triu(x, /, *, k=0):
 
     q = x.sycl_queue
     if k > shape[nd - 1]:
-        res = dpt_ext.zeros(
+        res = dpt.zeros(
             x.shape,
             dtype=x.dtype,
             order=order,
@@ -1793,7 +1792,7 @@ def triu(x, /, *, k=0):
             sycl_queue=q,
         )
     elif k <= -shape[nd - 2] + 1:
-        res = dpt_ext.empty(
+        res = dpt.empty(
             x.shape,
             dtype=x.dtype,
             order=order,
@@ -1807,7 +1806,7 @@ def triu(x, /, *, k=0):
         )
         _manager.add_event_pair(hev, cpy_ev)
     else:
-        res = dpt_ext.empty(
+        res = dpt.empty(
             x.shape,
             dtype=x.dtype,
             order=order,
diff --git a/dpctl_ext/tensor/_data_types.py b/dpctl_ext/tensor/_data_types.py
new file mode 100644
index 000000000000..faf30ffdabd0
--- /dev/null
+++ b/dpctl_ext/tensor/_data_types.py
@@ -0,0 +1,104 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from numpy import bool_ as np_bool_
+from numpy import complexfloating as np_complexfloating
+from numpy import dtype
+from numpy import floating as np_floating
+from numpy import integer as np_integer
+from numpy import issubdtype as np_issubdtype
+
+from ._tensor_impl import (
+    default_device_bool_type as ti_default_device_bool_type,
+)
+from ._tensor_impl import (
+    default_device_complex_type as ti_default_device_complex_type,
+)
+from ._tensor_impl import default_device_fp_type as ti_default_device_fp_type
+from ._tensor_impl import default_device_int_type as ti_default_device_int_type
+
+bool = dtype("bool")
+int8 = dtype("int8")
+int16 = dtype("int16")
+int32 = dtype("int32")
+int64 = dtype("int64")
+uint8 = dtype("uint8")
+uint16 = dtype("uint16")
+uint32 = dtype("uint32")
+uint64 = dtype("uint64")
+float16 = dtype("float16")
+float32 = dtype("float32")
+float64 = dtype("float64")
+complex64 = dtype("complex64")
+complex128 = dtype("complex128")
+
+
+def _get_dtype(inp_dt, sycl_obj, ref_type=None):
+    """
+    Type inference utility to construct data type
+    object with defaults based on reference type.
+
+    _get_dtype is used by dpctl.tensor.asarray
+    to infer data type of the output array from the
+    input sequence.
+    """
+    if inp_dt is None:
+        if ref_type in [None, float] or np_issubdtype(ref_type, np_floating):
+            fp_dt = ti_default_device_fp_type(sycl_obj)
+            return dtype(fp_dt)
+        if ref_type in [bool, np_bool_]:
+            bool_dt = ti_default_device_bool_type(sycl_obj)
+            return dtype(bool_dt)
+        if ref_type is int or np_issubdtype(ref_type, np_integer):
+            int_dt = ti_default_device_int_type(sycl_obj)
+            return dtype(int_dt)
+        if ref_type is complex or np_issubdtype(ref_type, np_complexfloating):
+            cfp_dt = ti_default_device_complex_type(sycl_obj)
+            return dtype(cfp_dt)
+        raise TypeError(f"Reference type {ref_type} not recognized.")
+    return dtype(inp_dt)
+
+
+__all__ = [
+    "dtype",
+    "_get_dtype",
+    "bool",
+    "int8",
+    "uint8",
+    "int16",
+    "uint16",
+    "int32",
+    "uint32",
+    "int64",
+    "uint64",
+    "float16",
+    "float32",
+    "float64",
+    "complex64",
+    "complex128",
+]
diff --git a/dpctl_ext/tensor/_device.py b/dpctl_ext/tensor/_device.py
new file mode 100644
index 000000000000..8d763bc721e3
--- /dev/null
+++ b/dpctl_ext/tensor/_device.py
@@ -0,0 +1,195 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+
+import dpctl
+from dpctl._sycl_device_factory import _cached_default_device
+from dpctl._sycl_queue_manager import get_device_cached_queue
+
+__doc__ = "Implementation of array API mandated Device class"
+
+
+class Device:
+    """
+    An object representing Data-API concept of device.
+
+    This is a wrapper around :class:`dpctl.SyclQueue` with custom
+    formatting. The class does not have public constructor,
+    but a class method :meth:`dpctl.tensor.Device.create_device` to construct
+    it from `device` keyword argument in Array-API functions.
+
+    Instance can be queried for ``sycl_queue``, ``sycl_context``,
+    or ``sycl_device``.
+    """
+
+    __device_queue_map__ = {}
+    sycl_queue_ = None
+
+    def __new__(cls, *args, **kwargs):
+        raise TypeError("No public constructor")
+
+    @classmethod
+    def create_device(cls, device=None):
+        """Device.create_device(device=None)
+
+        Creates instance of Device from argument.
+
+        Args:
+            device:
+                Device specification, i.e. `None`, :class:`.Device`,
+                :class:`dpctl.SyclQueue`, or a :class:`dpctl.SyclDevice`
+                corresponding to a root SYCL device.
+        Raises:
+            ValueError: if an instance of :class:`dpctl.SycDevice` corresponding
+                        to a sub-device was specified as the argument
+            SyclQueueCreationError: if :class:`dpctl.SyclQueue` could not be
+                                    created from the argument
+        """
+        dev = device
+        obj = super().__new__(cls)
+        if isinstance(dev, Device):
+            obj.sycl_queue_ = dev.sycl_queue
+        elif isinstance(dev, dpctl.SyclQueue):
+            obj.sycl_queue_ = dev
+        elif isinstance(dev, dpctl.SyclDevice):
+            par = dev.parent_device
+            if par is None:
+                obj.sycl_queue_ = get_device_cached_queue(dev)
+            else:
+                raise ValueError(
+                    f"Using non-root device {dev} to specify offloading "
+                    "target is ambiguous. Please use dpctl.SyclQueue "
+                    "targeting this device"
+                )
+        else:
+            if dev is None:
+                _dev = _cached_default_device()
+            else:
+                _dev = dpctl.SyclDevice(dev)
+            obj.sycl_queue_ = get_device_cached_queue(_dev)
+        return obj
+
+    @property
+    def sycl_queue(self):
+        """:class:`dpctl.SyclQueue` used to offload to this :class:`.Device`."""
+        return self.sycl_queue_
+
+    @property
+    def sycl_context(self):
+        """:class:`dpctl.SyclContext` associated with this :class:`.Device`."""
+        return self.sycl_queue_.sycl_context
+
+    @property
+    def sycl_device(self):
+        """:class:`dpctl.SyclDevice` targeted by this :class:`.Device`."""
+        return self.sycl_queue_.sycl_device
+
+    def __repr__(self):
+        try:
+            sd = self.sycl_device
+        except AttributeError as exc:
+            raise ValueError(
+                f"Instance of {self.__class__} is not initialized"
+            ) from exc
+        try:
+            fs = sd.filter_string
+            return f"Device({fs})"
+        except TypeError:
+            # This is a sub-device
+            return repr(self.sycl_queue)
+
+    def print_device_info(self):
+        """Outputs information about targeted SYCL device"""
+        self.sycl_device.print_device_info()
+
+    def wait(self):
+        """Call ``wait`` method of the underlying ``sycl_queue``."""
+        self.sycl_queue_.wait()
+
+    def __eq__(self, other):
+        """Equality comparison based on underlying ``sycl_queue``."""
+        if isinstance(other, Device):
+            return self.sycl_queue.__eq__(other.sycl_queue)
+        elif isinstance(other, dpctl.SyclQueue):
+            return self.sycl_queue.__eq__(other)
+        return False
+
+    def __hash__(self):
+        """Compute object's hash value."""
+        return self.sycl_queue.__hash__()
+
+
+def normalize_queue_device(sycl_queue=None, device=None):
+    """normalize_queue_device(sycl_queue=None, device=None)
+
+    Utility to process exclusive keyword arguments 'device'
+    and 'sycl_queue' in functions of `dpctl.tensor`.
+
+    Args:
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            explicitly indicates where USM allocation is done
+            and the population code (if any) is executed.
+            Value `None` is interpreted as get the SYCL queue
+            from `device` keyword, or use default queue.
+            Default: None
+        device (string, :class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue,
+            :class:`dpctl.tensor.Device`, optional):
+            array-API keyword indicating non-partitioned SYCL device
+            where array is allocated.
+
+    Returns
+        :class:`dpctl.SyclQueue` object implied by either of provided
+        keywords. If both are None, `dpctl.SyclQueue()` is returned.
+        If both are specified and imply the same queue, `sycl_queue`
+        is returned.
+
+    Raises:
+        TypeError: if argument is not of the expected type, or keywords
+            imply incompatible queues.
+    """
+    q = sycl_queue
+    d = device
+    if q is None:
+        d = Device.create_device(d)
+        return d.sycl_queue
+    if not isinstance(q, dpctl.SyclQueue):
+        raise TypeError(f"Expected dpctl.SyclQueue, got {type(q)}")
+    if d is None:
+        return q
+    d = Device.create_device(d)
+    qq = dpctl.utils.get_execution_queue(
+        (
+            q,
+            d.sycl_queue,
+        )
+    )
+    if qq is None:
+        raise TypeError(
+            "sycl_queue and device keywords can not be both specified"
+        )
+    return qq
diff --git a/dpctl_ext/tensor/_dldevice_conversions.py b/dpctl_ext/tensor/_dldevice_conversions.py
new file mode 100644
index 000000000000..595a280689a5
--- /dev/null
+++ b/dpctl_ext/tensor/_dldevice_conversions.py
@@ -0,0 +1,52 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from dpctl._sycl_device import SyclDevice
+
+from ._usmarray import DLDeviceType
+
+
+def dldevice_to_sycl_device(dl_dev: tuple):
+    if isinstance(dl_dev, tuple):
+        if len(dl_dev) != 2:
+            raise ValueError("dldevice tuple must have length 2")
+    else:
+        raise TypeError(
+            f"dl_dev is expected to be a 2-tuple, got " f"{type(dl_dev)}"
+        )
+    if dl_dev[0] != DLDeviceType.kDLOneAPI:
+        raise ValueError("dldevice type must be kDLOneAPI")
+    return SyclDevice(str(dl_dev[1]))
+
+
+def sycl_device_to_dldevice(dev: SyclDevice):
+    if not isinstance(dev, SyclDevice):
+        raise TypeError(
+            "dev is expected to be a SyclDevice, got " f"{type(dev)}"
+        )
+    return (DLDeviceType.kDLOneAPI, dev.get_device_id())
diff --git a/dpctl_ext/tensor/_dlpack.pxd b/dpctl_ext/tensor/_dlpack.pxd
new file mode 100644
index 000000000000..75378bfa7a92
--- /dev/null
+++ b/dpctl_ext/tensor/_dlpack.pxd
@@ -0,0 +1,73 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+# cython: linetrace=True
+
+cdef extern from "numpy/npy_no_deprecated_api.h":
+    pass
+from dpctl._sycl_device cimport SyclDevice
+from numpy cimport ndarray
+
+from ._usmarray cimport usm_ndarray
+
+
+cdef extern from "dlpack/dlpack.h" nogil:
+    int device_CPU "kDLCPU"
+    int device_CUDA "kDLCUDA"
+    int device_CUDAHost "kDLCUDAHost"
+    int device_CUDAManaged "kDLCUDAManaged"
+    int device_DLROCM "kDLROCM"
+    int device_ROCMHost "kDLROCMHost"
+    int device_OpenCL "kDLOpenCL"
+    int device_Vulkan "kDLVulkan"
+    int device_Metal "kDLMetal"
+    int device_VPI "kDLVPI"
+    int device_OneAPI "kDLOneAPI"
+    int device_WebGPU "kDLWebGPU"
+    int device_Hexagon "kDLHexagon"
+    int device_MAIA "kDLMAIA"
+    int device_Trn "kDLTrn"
+
+cpdef object to_dlpack_capsule(usm_ndarray array) except +
+cpdef object to_dlpack_versioned_capsule(
+    usm_ndarray array, bint copied
+) except +
+cpdef object numpy_to_dlpack_versioned_capsule(
+    ndarray array, bint copied
+) except +
+cpdef object from_dlpack_capsule(object dltensor) except +
+
+cdef class DLPackCreationError(Exception):
+    """
+    A DLPackCreateError exception is raised when constructing
+    DLPack capsule from `usm_ndarray` based on a USM allocation
+    on a partitioned SYCL device.
+    """
+    pass
diff --git a/dpctl_ext/tensor/_dlpack.pyx b/dpctl_ext/tensor/_dlpack.pyx
new file mode 100644
index 000000000000..21b3d877c475
--- /dev/null
+++ b/dpctl_ext/tensor/_dlpack.pyx
@@ -0,0 +1,1245 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+# cython: linetrace=True
+
+cdef extern from "numpy/npy_no_deprecated_api.h":
+    pass
+
+cimport cpython
+cimport dpctl as c_dpctl
+cimport dpctl.memory as c_dpmem
+from dpctl._backend cimport (
+    DPCTLDevice_Delete,
+    DPCTLDevice_GetParentDevice,
+    DPCTLSyclDeviceRef,
+    DPCTLSyclUSMRef,
+)
+from dpctl._sycl_queue_manager cimport get_device_cached_queue
+from libc cimport stdlib
+from libc.stdint cimport int64_t, uint8_t, uint16_t, uint32_t, uint64_t
+from numpy cimport ndarray
+
+from ._usmarray cimport (
+    USM_ARRAY_C_CONTIGUOUS,
+    USM_ARRAY_F_CONTIGUOUS,
+    USM_ARRAY_WRITABLE,
+    usm_ndarray,
+)
+
+import ctypes
+
+import dpctl
+import dpctl.memory as dpmem
+import numpy as np
+
+from ._device import Device
+
+
+cdef extern from "dlpack/dlpack.h" nogil:
+    cdef int DLPACK_MAJOR_VERSION
+
+    cdef int DLPACK_MINOR_VERSION
+
+    cdef int DLPACK_FLAG_BITMASK_READ_ONLY
+
+    cdef int DLPACK_FLAG_BITMASK_IS_COPIED
+
+    ctypedef struct DLPackVersion:
+        uint32_t major
+        uint32_t minor
+
+    cdef enum DLDeviceType:
+        kDLCPU
+        kDLCUDA
+        kDLCUDAHost
+        kDLCUDAManaged
+        kDLROCM
+        kDLROCMHost
+        kDLOpenCL
+        kDLVulkan
+        kDLMetal
+        kDLVPI
+        kDLOneAPI
+        kDLWebGPU
+        kDLHexagon
+        kDLMAIA
+        kDLTrn
+
+    ctypedef struct DLDevice:
+        DLDeviceType device_type
+        int device_id
+
+    cdef enum DLDataTypeCode:
+        kDLInt
+        kDLUInt
+        kDLFloat
+        kDLBfloat
+        kDLComplex
+        kDLBool
+        kDLFloat8_e3m4
+        kDLFloat8_e4m3
+        kDLFloat8_e4m3b11fnuz
+        kDLFloat8_e4m3fn
+        kDLFloat8_e4m3fnuz
+        kDLFloat8_e5m2
+        kDLFloat8_e5m2fnuz
+        kDLFloat8_e8m0fnu
+        kDLFloat6_e2m3fn
+        kDLFloat6_e3m2fn
+        kDLFloat4_e2m1fn
+
+    ctypedef struct DLDataType:
+        uint8_t code
+        uint8_t bits
+        uint16_t lanes
+
+    ctypedef struct DLTensor:
+        void *data
+        DLDevice device
+        int ndim
+        DLDataType dtype
+        int64_t *shape
+        int64_t *strides
+        uint64_t byte_offset
+
+    ctypedef struct DLManagedTensor:
+        DLTensor dl_tensor
+        void *manager_ctx
+        void (*deleter)(DLManagedTensor *)  # noqa: E211
+
+    ctypedef struct DLManagedTensorVersioned:
+        DLPackVersion version
+        void *manager_ctx
+        void (*deleter)(DLManagedTensorVersioned *)  # noqa: E211
+        uint64_t flags
+        DLTensor dl_tensor
+
+
+def get_build_dlpack_version():
+    """
+    Returns a tuple of integers representing the `major` and `minor`
+    version of DLPack :module:`dpctl.tensor` was built with.
+    This tuple can be passed as the `max_version` argument to
+    `__dlpack__` to guarantee module:`dpctl.tensor` can properly
+    consume capsule.
+
+    Returns:
+        Tuple[int, int]
+            A tuple of integers representing the `major` and `minor`
+            version of DLPack used to build :module:`dpctl.tensor`.
+    """
+    return (DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION)
+
+
+cdef void _pycapsule_deleter(object dlt_capsule) noexcept:
+    cdef DLManagedTensor *dlm_tensor = NULL
+    if cpython.PyCapsule_IsValid(dlt_capsule, "dltensor"):
+        dlm_tensor = <DLManagedTensor*>cpython.PyCapsule_GetPointer(
+            dlt_capsule, "dltensor")
+        dlm_tensor.deleter(dlm_tensor)
+
+
+cdef void _managed_tensor_deleter(
+    DLManagedTensor *dlm_tensor
+) noexcept with gil:
+    if dlm_tensor is not NULL:
+        # we only delete shape, because we make single allocation to
+        # accommodate both shape and strides if strides are needed
+        stdlib.free(dlm_tensor.dl_tensor.shape)
+        cpython.Py_DECREF(<object>dlm_tensor.manager_ctx)
+        dlm_tensor.manager_ctx = NULL
+        stdlib.free(dlm_tensor)
+
+
+cdef void _pycapsule_versioned_deleter(object dlt_capsule) noexcept:
+    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+    if cpython.PyCapsule_IsValid(dlt_capsule, "dltensor_versioned"):
+        dlmv_tensor = <DLManagedTensorVersioned*>cpython.PyCapsule_GetPointer(
+            dlt_capsule, "dltensor_versioned")
+        dlmv_tensor.deleter(dlmv_tensor)
+
+
+cdef void _managed_tensor_versioned_deleter(
+    DLManagedTensorVersioned *dlmv_tensor
+) noexcept with gil:
+    if dlmv_tensor is not NULL:
+        # we only delete shape, because we make single allocation to
+        # accommodate both shape and strides if strides are needed
+        stdlib.free(dlmv_tensor.dl_tensor.shape)
+        cpython.Py_DECREF(<object>dlmv_tensor.manager_ctx)
+        dlmv_tensor.manager_ctx = NULL
+        stdlib.free(dlmv_tensor)
+
+
+cdef object _get_default_context(c_dpctl.SyclDevice dev):
+    try:
+        default_context = dev.sycl_platform.default_context
+    except RuntimeError:
+        # RT does not support default_context
+        default_context = None
+
+    return default_context
+
+cdef int get_array_dlpack_device_id(
+    usm_ndarray usm_ary
+) except -1:
+    """Finds ordinal number of the parent of device where array
+    was allocated.
+    """
+    cdef c_dpctl.SyclQueue ary_sycl_queue
+    cdef c_dpctl.SyclDevice ary_sycl_device
+    cdef DPCTLSyclDeviceRef pDRef = NULL
+    cdef int device_id = -1
+
+    ary_sycl_queue = usm_ary.get_sycl_queue()
+    ary_sycl_device = ary_sycl_queue.get_sycl_device()
+
+    default_context = _get_default_context(ary_sycl_device)
+    if default_context is None:
+        # check that ary_sycl_device is a non-partitioned device
+        pDRef = DPCTLDevice_GetParentDevice(ary_sycl_device.get_device_ref())
+        if pDRef is not NULL:
+            DPCTLDevice_Delete(pDRef)
+            raise DLPackCreationError(
+                "to_dlpack_capsule: DLPack can only export arrays allocated "
+                "on non-partitioned SYCL devices on platforms where "
+                "default_context oneAPI extension is not supported."
+            )
+    else:
+        if not usm_ary.sycl_context == default_context:
+            raise DLPackCreationError(
+                "to_dlpack_capsule: DLPack can only export arrays based on USM "
+                "allocations bound to a default platform SYCL context"
+            )
+    device_id = ary_sycl_device.get_device_id()
+
+    if device_id < 0:
+        raise DLPackCreationError(
+            "get_array_dlpack_device_id: failed to determine device_id"
+        )
+
+    return device_id
+
+
+cpdef to_dlpack_capsule(usm_ndarray usm_ary):
+    """
+    to_dlpack_capsule(usm_ary)
+
+    Constructs named Python capsule object referencing
+    instance of ``DLManagedTensor`` from
+    :class:`dpctl.tensor.usm_ndarray` instance.
+
+    Args:
+        usm_ary: An instance of :class:`dpctl.tensor.usm_ndarray`
+    Returns:
+        A new capsule with name ``"dltensor"`` that contains
+        a pointer to ``DLManagedTensor`` struct.
+    Raises:
+        DLPackCreationError: when array can be represented as
+            DLPack tensor. This may happen when array was allocated
+            on a partitioned sycl device, or its USM allocation is
+            not bound to the platform default SYCL context.
+        MemoryError: when host allocation to needed for ``DLManagedTensor``
+            did not succeed.
+        ValueError: when array elements data type could not be represented
+            in ``DLManagedTensor``.
+    """
+    cdef DLManagedTensor *dlm_tensor = NULL
+    cdef DLTensor *dl_tensor = NULL
+    cdef int nd = usm_ary.get_ndim()
+    cdef char *data_ptr = usm_ary.get_data()
+    cdef Py_ssize_t *shape_ptr = NULL
+    cdef Py_ssize_t *strides_ptr = NULL
+    cdef int64_t *shape_strides_ptr = NULL
+    cdef int i = 0
+    cdef int device_id = -1
+    cdef int flags = 0
+    cdef Py_ssize_t element_offset = 0
+    cdef Py_ssize_t byte_offset = 0
+    cdef Py_ssize_t si = 1
+
+    ary_base = usm_ary.get_base()
+
+    device_id = get_array_dlpack_device_id(usm_ary)
+
+    dlm_tensor = <DLManagedTensor *> stdlib.malloc(
+        sizeof(DLManagedTensor))
+    if dlm_tensor is NULL:
+        raise MemoryError(
+            "to_dlpack_capsule: Could not allocate memory for DLManagedTensor"
+        )
+    if nd > 0:
+        shape_strides_ptr = <int64_t *>stdlib.malloc((sizeof(int64_t) * 2) * nd)
+        if shape_strides_ptr is NULL:
+            stdlib.free(dlm_tensor)
+            raise MemoryError(
+                "to_dlpack_capsule: Could not allocate memory for shape/strides"
+            )
+        shape_ptr = usm_ary.get_shape()
+        for i in range(nd):
+            shape_strides_ptr[i] = shape_ptr[i]
+        strides_ptr = usm_ary.get_strides()
+        flags = usm_ary.flags_
+        if strides_ptr:
+            for i in range(nd):
+                shape_strides_ptr[nd + i] = strides_ptr[i]
+        else:
+            if flags & USM_ARRAY_C_CONTIGUOUS:
+                si = 1
+                for i in range(nd - 1, -1, -1):
+                    shape_strides_ptr[nd + i] = si
+                    si = si * shape_ptr[i]
+            elif flags & USM_ARRAY_F_CONTIGUOUS:
+                si = 1
+                for i in range(0, nd):
+                    shape_strides_ptr[nd + i] = si
+                    si = si * shape_ptr[i]
+            else:
+                stdlib.free(shape_strides_ptr)
+                stdlib.free(dlm_tensor)
+                raise BufferError(
+                    "to_dlpack_capsule: Invalid array encountered "
+                    "when building strides"
+                )
+
+            strides_ptr = <Py_ssize_t *>&shape_strides_ptr[nd]
+
+    ary_dt = usm_ary.dtype
+    ary_dtk = ary_dt.kind
+    element_offset = usm_ary.get_offset()
+    byte_offset = element_offset * (<Py_ssize_t>ary_dt.itemsize)
+
+    dl_tensor = &dlm_tensor.dl_tensor
+    dl_tensor.data = <void*>(data_ptr - byte_offset)
+    dl_tensor.ndim = nd
+    dl_tensor.byte_offset = <uint64_t>byte_offset
+    dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
+    dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
+    dl_tensor.device.device_type = kDLOneAPI
+    dl_tensor.device.device_id = device_id
+    dl_tensor.dtype.lanes = <uint16_t>1
+    dl_tensor.dtype.bits = <uint8_t>(ary_dt.itemsize * 8)
+    if (ary_dtk == "b"):
+        dl_tensor.dtype.code = <uint8_t>kDLBool
+    elif (ary_dtk == "u"):
+        dl_tensor.dtype.code = <uint8_t>kDLUInt
+    elif (ary_dtk == "i"):
+        dl_tensor.dtype.code = <uint8_t>kDLInt
+    elif (ary_dtk == "f"):
+        dl_tensor.dtype.code = <uint8_t>kDLFloat
+    elif (ary_dtk == "c"):
+        dl_tensor.dtype.code = <uint8_t>kDLComplex
+    else:
+        stdlib.free(shape_strides_ptr)
+        stdlib.free(dlm_tensor)
+        raise ValueError("Unrecognized array data type")
+
+    dlm_tensor.manager_ctx = <void*>ary_base
+    cpython.Py_INCREF(ary_base)
+    dlm_tensor.deleter = _managed_tensor_deleter
+
+    return cpython.PyCapsule_New(dlm_tensor, "dltensor", _pycapsule_deleter)
+
+
+cpdef to_dlpack_versioned_capsule(usm_ndarray usm_ary, bint copied):
+    """
+    to_dlpack_versioned_capsule(usm_ary, copied)
+
+    Constructs named Python capsule object referencing
+    instance of ``DLManagedTensorVersioned`` from
+    :class:`dpctl.tensor.usm_ndarray` instance.
+
+    Args:
+        usm_ary: An instance of :class:`dpctl.tensor.usm_ndarray`
+        copied: A bint representing whether the data was previously
+            copied in order to set the flags with the is-copied
+            bitmask.
+    Returns:
+        A new capsule with name ``"dltensor_versioned"`` that
+        contains a pointer to ``DLManagedTensorVersioned`` struct.
+    Raises:
+        DLPackCreationError: when array can be represented as
+            DLPack tensor. This may happen when array was allocated
+            on a partitioned sycl device, or its USM allocation is
+            not bound to the platform default SYCL context.
+        MemoryError: when host allocation to needed for
+            ``DLManagedTensorVersioned`` did not succeed.
+        ValueError: when array elements data type could not be represented
+            in ``DLManagedTensorVersioned``.
+    """
+    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+    cdef DLTensor *dl_tensor = NULL
+    cdef uint32_t dlmv_flags = 0
+    cdef int nd = usm_ary.get_ndim()
+    cdef char *data_ptr = usm_ary.get_data()
+    cdef Py_ssize_t *shape_ptr = NULL
+    cdef Py_ssize_t *strides_ptr = NULL
+    cdef int64_t *shape_strides_ptr = NULL
+    cdef int i = 0
+    cdef int device_id = -1
+    cdef int flags = 0
+    cdef Py_ssize_t element_offset = 0
+    cdef Py_ssize_t byte_offset = 0
+    cdef Py_ssize_t si = 1
+
+    ary_base = usm_ary.get_base()
+
+    # Find ordinal number of the parent device
+    device_id = get_array_dlpack_device_id(usm_ary)
+
+    dlmv_tensor = <DLManagedTensorVersioned *> stdlib.malloc(
+        sizeof(DLManagedTensorVersioned))
+    if dlmv_tensor is NULL:
+        raise MemoryError(
+            "to_dlpack_versioned_capsule: Could not allocate memory "
+            "for DLManagedTensorVersioned"
+        )
+    if nd > 0:
+        shape_strides_ptr = <int64_t *>stdlib.malloc((sizeof(int64_t) * 2) * nd)
+        if shape_strides_ptr is NULL:
+            stdlib.free(dlmv_tensor)
+            raise MemoryError(
+                "to_dlpack_versioned_capsule: Could not allocate memory "
+                "for shape/strides"
+            )
+        # this can be a separate function for handling shapes and strides
+        shape_ptr = usm_ary.get_shape()
+        for i in range(nd):
+            shape_strides_ptr[i] = shape_ptr[i]
+        strides_ptr = usm_ary.get_strides()
+        flags = usm_ary.flags_
+        if strides_ptr:
+            for i in range(nd):
+                shape_strides_ptr[nd + i] = strides_ptr[i]
+        else:
+            if flags & USM_ARRAY_C_CONTIGUOUS:
+                si = 1
+                for i in range(nd - 1, -1, -1):
+                    shape_strides_ptr[nd + i] = si
+                    si = si * shape_ptr[i]
+            elif flags & USM_ARRAY_F_CONTIGUOUS:
+                si = 1
+                for i in range(0, nd):
+                    shape_strides_ptr[nd + i] = si
+                    si = si * shape_ptr[i]
+            else:
+                stdlib.free(shape_strides_ptr)
+                stdlib.free(dlmv_tensor)
+                raise BufferError(
+                    "to_dlpack_versioned_capsule: Invalid array encountered "
+                    "when building strides"
+                )
+
+            strides_ptr = <Py_ssize_t *>&shape_strides_ptr[nd]
+
+    # this can all be a function for building the dl_tensor
+    # object (separate from dlm/dlmv)
+    ary_dt = usm_ary.dtype
+    ary_dtk = ary_dt.kind
+    element_offset = usm_ary.get_offset()
+    byte_offset = element_offset * (<Py_ssize_t>ary_dt.itemsize)
+
+    dl_tensor = &dlmv_tensor.dl_tensor
+    dl_tensor.data = <void*>(data_ptr - byte_offset)
+    dl_tensor.ndim = nd
+    dl_tensor.byte_offset = <uint64_t>byte_offset
+    dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
+    dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
+    dl_tensor.device.device_type = kDLOneAPI
+    dl_tensor.device.device_id = device_id
+    dl_tensor.dtype.lanes = <uint16_t>1
+    dl_tensor.dtype.bits = <uint8_t>(ary_dt.itemsize * 8)
+    if (ary_dtk == "b"):
+        dl_tensor.dtype.code = <uint8_t>kDLBool
+    elif (ary_dtk == "u"):
+        dl_tensor.dtype.code = <uint8_t>kDLUInt
+    elif (ary_dtk == "i"):
+        dl_tensor.dtype.code = <uint8_t>kDLInt
+    elif (ary_dtk == "f"):
+        dl_tensor.dtype.code = <uint8_t>kDLFloat
+    elif (ary_dtk == "c"):
+        dl_tensor.dtype.code = <uint8_t>kDLComplex
+    else:
+        stdlib.free(shape_strides_ptr)
+        stdlib.free(dlmv_tensor)
+        raise ValueError("Unrecognized array data type")
+
+    # set flags down here
+    if copied:
+        dlmv_flags |= DLPACK_FLAG_BITMASK_IS_COPIED
+    if not (flags & USM_ARRAY_WRITABLE):
+        dlmv_flags |= DLPACK_FLAG_BITMASK_READ_ONLY
+    dlmv_tensor.flags = dlmv_flags
+
+    dlmv_tensor.version.major = DLPACK_MAJOR_VERSION
+    dlmv_tensor.version.minor = DLPACK_MINOR_VERSION
+
+    dlmv_tensor.manager_ctx = <void*>ary_base
+    cpython.Py_INCREF(ary_base)
+    dlmv_tensor.deleter = _managed_tensor_versioned_deleter
+
+    return cpython.PyCapsule_New(
+        dlmv_tensor, "dltensor_versioned", _pycapsule_versioned_deleter
+    )
+
+
+cpdef numpy_to_dlpack_versioned_capsule(ndarray npy_ary, bint copied):
+    """
+    to_dlpack_versioned_capsule(npy_ary, copied)
+
+    Constructs named Python capsule object referencing
+    instance of ``DLManagedTensorVersioned`` from
+    :class:`numpy.ndarray` instance.
+
+    Args:
+        npy_ary: An instance of :class:`numpy.ndarray`
+        copied: A bint representing whether the data was previously
+            copied in order to set the flags with the is-copied
+            bitmask.
+    Returns:
+        A new capsule with name ``"dltensor_versioned"`` that
+        contains a pointer to ``DLManagedTensorVersioned`` struct.
+    Raises:
+        DLPackCreationError: when array can be represented as
+            DLPack tensor.
+        MemoryError: when host allocation to needed for
+            ``DLManagedTensorVersioned`` did not succeed.
+        ValueError: when array elements data type could not be represented
+            in ``DLManagedTensorVersioned``.
+    """
+    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+    cdef DLTensor *dl_tensor = NULL
+    cdef uint32_t dlmv_flags = 0
+    cdef int nd = npy_ary.ndim
+    cdef int64_t *shape_strides_ptr = NULL
+    cdef int i = 0
+    cdef Py_ssize_t byte_offset = 0
+    cdef int itemsize = npy_ary.itemsize
+
+    dlmv_tensor = <DLManagedTensorVersioned *> stdlib.malloc(
+        sizeof(DLManagedTensorVersioned))
+    if dlmv_tensor is NULL:
+        raise MemoryError(
+            "numpy_to_dlpack_versioned_capsule: Could not allocate memory "
+            "for DLManagedTensorVersioned"
+        )
+
+    shape = npy_ary.ctypes.shape_as(ctypes.c_int64)
+    strides = npy_ary.ctypes.strides_as(ctypes.c_int64)
+    if nd > 0:
+        if npy_ary.size != 1:
+            for i in range(nd):
+                if shape[i] != 1 and strides[i] % itemsize != 0:
+                    stdlib.free(dlmv_tensor)
+                    raise BufferError(
+                        "numpy_to_dlpack_versioned_capsule: DLPack cannot "
+                        "encode an array if strides are not a multiple of "
+                        "itemsize"
+                    )
+        shape_strides_ptr = <int64_t *>stdlib.malloc((sizeof(int64_t) * 2) * nd)
+        if shape_strides_ptr is NULL:
+            stdlib.free(dlmv_tensor)
+            raise MemoryError(
+                "numpy_to_dlpack_versioned_capsule: Could not allocate memory "
+                "for shape/strides"
+            )
+        for i in range(nd):
+            shape_strides_ptr[i] = shape[i]
+            shape_strides_ptr[nd + i] = strides[i] // itemsize
+
+    writable_flag = npy_ary.flags["W"]
+
+    ary_dt = npy_ary.dtype
+    ary_dtk = ary_dt.kind
+
+    dl_tensor = &dlmv_tensor.dl_tensor
+    dl_tensor.data = <void *> npy_ary.data
+    dl_tensor.ndim = nd
+    dl_tensor.byte_offset = <uint64_t>byte_offset
+    dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
+    dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
+    dl_tensor.device.device_type = kDLCPU
+    dl_tensor.device.device_id = 0
+    dl_tensor.dtype.lanes = <uint16_t>1
+    dl_tensor.dtype.bits = <uint8_t>(ary_dt.itemsize * 8)
+    if (ary_dtk == "b"):
+        dl_tensor.dtype.code = <uint8_t>kDLBool
+    elif (ary_dtk == "u"):
+        dl_tensor.dtype.code = <uint8_t>kDLUInt
+    elif (ary_dtk == "i"):
+        dl_tensor.dtype.code = <uint8_t>kDLInt
+    elif (ary_dtk == "f" and ary_dt.itemsize <= 8):
+        dl_tensor.dtype.code = <uint8_t>kDLFloat
+    elif (ary_dtk == "c" and ary_dt.itemsize <= 16):
+        dl_tensor.dtype.code = <uint8_t>kDLComplex
+    else:
+        stdlib.free(shape_strides_ptr)
+        stdlib.free(dlmv_tensor)
+        raise ValueError("Unrecognized array data type")
+
+    # set flags down here
+    if copied:
+        dlmv_flags |= DLPACK_FLAG_BITMASK_IS_COPIED
+    if not writable_flag:
+        dlmv_flags |= DLPACK_FLAG_BITMASK_READ_ONLY
+    dlmv_tensor.flags = dlmv_flags
+
+    dlmv_tensor.version.major = DLPACK_MAJOR_VERSION
+    dlmv_tensor.version.minor = DLPACK_MINOR_VERSION
+
+    dlmv_tensor.manager_ctx = <void*>npy_ary
+    cpython.Py_INCREF(npy_ary)
+    dlmv_tensor.deleter = _managed_tensor_versioned_deleter
+
+    return cpython.PyCapsule_New(
+        dlmv_tensor, "dltensor_versioned", _pycapsule_versioned_deleter
+    )
+
+
+cdef class _DLManagedTensorOwner:
+    """
+    Helper class managing the lifetime of the DLManagedTensor struct
+    transferred from a 'dlpack' capsule.
+    """
+    cdef DLManagedTensor * dlm_tensor
+
+    def __cinit__(self):
+        self.dlm_tensor = NULL
+
+    def __dealloc__(self):
+        if self.dlm_tensor:
+            self.dlm_tensor.deleter(self.dlm_tensor)
+            self.dlm_tensor = NULL
+
+    @staticmethod
+    cdef _DLManagedTensorOwner _create(DLManagedTensor *dlm_tensor_src):
+        cdef _DLManagedTensorOwner res
+        res = _DLManagedTensorOwner.__new__(_DLManagedTensorOwner)
+        res.dlm_tensor = dlm_tensor_src
+        return res
+
+
+cdef class _DLManagedTensorVersionedOwner:
+    """
+    Helper class managing the lifetime of the DLManagedTensorVersioned
+    struct transferred from a 'dlpack_versioned' capsule.
+    """
+    cdef DLManagedTensorVersioned * dlmv_tensor
+
+    def __cinit__(self):
+        self.dlmv_tensor = NULL
+
+    def __dealloc__(self):
+        if self.dlmv_tensor:
+            self.dlmv_tensor.deleter(self.dlmv_tensor)
+            self.dlmv_tensor = NULL
+
+    @staticmethod
+    cdef _DLManagedTensorVersionedOwner _create(
+        DLManagedTensorVersioned *dlmv_tensor_src
+    ):
+        cdef _DLManagedTensorVersionedOwner res
+        res = _DLManagedTensorVersionedOwner.__new__(
+            _DLManagedTensorVersionedOwner
+        )
+        res.dlmv_tensor = dlmv_tensor_src
+        return res
+
+
+cdef dict _numpy_array_interface_from_dl_tensor(DLTensor *dlt, bint ro_flag):
+    """Constructs a NumPy `__array_interface__` dictionary from a DLTensor."""
+    cdef int itemsize = 0
+
+    if dlt.dtype.lanes != 1:
+        raise BufferError(
+            "Can not import DLPack tensor with lanes != 1"
+        )
+    itemsize = dlt.dtype.bits // 8
+    shape = list()
+    if (dlt.strides is NULL):
+        strides = None
+        for dim in range(dlt.ndim):
+            shape.append(dlt.shape[dim])
+    else:
+        strides = list()
+        for dim in range(dlt.ndim):
+            shape.append(dlt.shape[dim])
+            # convert to byte-strides
+            strides.append(dlt.strides[dim] * itemsize)
+        strides = tuple(strides)
+    shape = tuple(shape)
+    if (dlt.dtype.code == kDLUInt):
+        ary_dt = "u" + str(itemsize)
+    elif (dlt.dtype.code == kDLInt):
+        ary_dt = "i" + str(itemsize)
+    elif (dlt.dtype.code == kDLFloat):
+        ary_dt = "f" + str(itemsize)
+    elif (dlt.dtype.code == kDLComplex):
+        ary_dt = "c" + str(itemsize)
+    elif (dlt.dtype.code == kDLBool):
+        ary_dt = "b" + str(itemsize)
+    else:
+        raise BufferError(
+            "Can not import DLPack tensor with type code {}.".format(
+                <object>dlt.dtype.code
+            )
+        )
+    typestr = "|" + ary_dt
+    return dict(
+        version=3,
+        shape=shape,
+        strides=strides,
+        data=(<size_t> dlt.data, True if ro_flag else False),
+        offset=dlt.byte_offset,
+        typestr=typestr,
+    )
+
+
+class _numpy_array_interface_wrapper:
+    """
+    Class that wraps a Python capsule and dictionary for consumption by NumPy.
+
+    Implementation taken from
+    https://github.com/dmlc/dlpack/blob/main/apps/numpy_dlpack/dlpack/to_numpy.py
+
+    Args:
+        array_interface:
+            A dictionary describing the underlying memory. Formatted
+            to match `numpy.ndarray.__array_interface__`.
+
+        pycapsule:
+            A Python capsule wrapping the dlpack tensor that will be
+            converted to numpy.
+    """
+
+    def __init__(self, array_interface, memory_owner) -> None:
+        self.__array_interface__ = array_interface
+        self._memory_owner = memory_owner
+
+
+cdef bint _is_kdlcpu_device(DLDevice *dev):
+    "Check if DLTensor.DLDevice denotes (kDLCPU, 0)"
+    return (dev[0].device_type == kDLCPU) and (dev[0].device_id == 0)
+
+
+cpdef object from_dlpack_capsule(object py_caps):
+    """
+    from_dlpack_capsule(py_caps)
+
+    Reconstructs instance of :class:`dpctl.tensor.usm_ndarray` from
+    named Python capsule object referencing instance of ``DLManagedTensor``
+    without copy. The instance forms a view in the memory of the tensor.
+
+    Args:
+        caps:
+            Python capsule with name ``"dltensor"`` expected to reference
+            an instance of ``DLManagedTensor`` struct.
+    Returns:
+        Instance of :class:`dpctl.tensor.usm_ndarray` with a view into
+        memory of the tensor. Capsule is renamed to ``"used_dltensor"``
+        upon success.
+    Raises:
+        TypeError:
+            if argument is not a ``"dltensor"`` capsule.
+        ValueError:
+            if argument is ``"used_dltensor"`` capsule
+        BufferError:
+            if the USM pointer is not bound to the reconstructed
+            sycl context, or the DLPack's device_type is not supported
+            by :mod:`dpctl`.
+    """
+    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+    cdef DLManagedTensor *dlm_tensor = NULL
+    cdef DLTensor *dl_tensor = NULL
+    cdef int versioned = 0
+    cdef int readonly = 0
+    cdef bytes usm_type
+    cdef size_t sz = 1
+    cdef size_t alloc_sz = 1
+    cdef int i
+    cdef int device_id = -1
+    cdef int element_bytesize = 0
+    cdef Py_ssize_t offset_min = 0
+    cdef Py_ssize_t offset_max = 0
+    cdef char *mem_ptr = NULL
+    cdef Py_ssize_t mem_ptr_delta = 0
+    cdef Py_ssize_t element_offset = 0
+    cdef int64_t stride_i = -1
+    cdef int64_t shape_i = -1
+
+    if cpython.PyCapsule_IsValid(py_caps, "dltensor"):
+        dlm_tensor = <DLManagedTensor*>cpython.PyCapsule_GetPointer(
+                py_caps, "dltensor")
+        dl_tensor = &dlm_tensor.dl_tensor
+    elif cpython.PyCapsule_IsValid(py_caps, "dltensor_versioned"):
+        dlmv_tensor = <DLManagedTensorVersioned*>cpython.PyCapsule_GetPointer(
+                py_caps, "dltensor_versioned")
+        if dlmv_tensor.version.major > DLPACK_MAJOR_VERSION:
+            raise BufferError(
+                "Can not import DLPack tensor with major version "
+                f"greater than {DLPACK_MAJOR_VERSION}"
+            )
+        versioned = 1
+        readonly = (dlmv_tensor.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0
+        dl_tensor = &dlmv_tensor.dl_tensor
+    elif (
+        cpython.PyCapsule_IsValid(py_caps, "used_dltensor")
+        or cpython.PyCapsule_IsValid(py_caps, "used_dltensor_versioned")
+    ):
+        raise ValueError(
+            "A DLPack tensor object can not be consumed multiple times"
+        )
+    else:
+        raise TypeError(
+            "`from_dlpack_capsule` expects a Python 'dltensor' capsule"
+        )
+
+    # Verify that we can work with this device
+    if dl_tensor.device.device_type == kDLOneAPI:
+        device_id = dl_tensor.device.device_id
+        root_device = dpctl.SyclDevice(str(<int>device_id))
+        try:
+            default_context = root_device.sycl_platform.default_context
+        except RuntimeError:
+            default_context = get_device_cached_queue(root_device).sycl_context
+        if dl_tensor.data is NULL:
+            usm_type = b"device"
+            q = get_device_cached_queue((default_context, root_device,))
+        else:
+            usm_type = c_dpmem._Memory.get_pointer_type(
+                <DPCTLSyclUSMRef> dl_tensor.data,
+                <c_dpctl.SyclContext>default_context)
+            if usm_type == b"unknown":
+                raise BufferError(
+                    "Data pointer in DLPack is not bound to default sycl "
+                    f"context of device '{device_id}', translated to "
+                    f"{root_device.filter_string}"
+                )
+            alloc_device = c_dpmem._Memory.get_pointer_device(
+                <DPCTLSyclUSMRef> dl_tensor.data,
+                <c_dpctl.SyclContext>default_context
+            )
+            q = get_device_cached_queue((default_context, alloc_device,))
+        if dl_tensor.dtype.bits % 8:
+            raise BufferError(
+                "Can not import DLPack tensor whose element's "
+                "bitsize is not a multiple of 8"
+            )
+        if dl_tensor.dtype.lanes != 1:
+            raise BufferError(
+                "Can not import DLPack tensor with lanes != 1"
+            )
+        if dl_tensor.ndim > 0:
+            offset_min = 0
+            offset_max = 0
+            for i in range(dl_tensor.ndim):
+                stride_i = dl_tensor.strides[i]
+                shape_i = dl_tensor.shape[i]
+                if shape_i > 1:
+                    shape_i -= 1
+                    if stride_i > 0:
+                        offset_max = offset_max + stride_i * shape_i
+                    else:
+                        offset_min = offset_min + stride_i * shape_i
+            sz = offset_max - offset_min + 1
+        if sz == 0:
+            sz = 1
+
+        element_bytesize = (dl_tensor.dtype.bits // 8)
+        sz = sz * element_bytesize
+        element_offset = dl_tensor.byte_offset // element_bytesize
+
+        # transfer ownership
+        if not versioned:
+            dlm_holder = _DLManagedTensorOwner._create(dlm_tensor)
+            cpython.PyCapsule_SetName(py_caps, "used_dltensor")
+        else:
+            dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor)
+            cpython.PyCapsule_SetName(py_caps, "used_dltensor_versioned")
+
+        if dl_tensor.data is NULL:
+            usm_mem = dpmem.MemoryUSMDevice(sz, q)
+        else:
+            mem_ptr_delta = dl_tensor.byte_offset - (
+                element_offset * element_bytesize
+            )
+            mem_ptr = <char *>dl_tensor.data
+            alloc_sz = dl_tensor.byte_offset + <uint64_t>(
+                (offset_max + 1) * element_bytesize)
+            tmp = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+                <DPCTLSyclUSMRef> mem_ptr,
+                max(alloc_sz, <uint64_t>element_bytesize),
+                (<c_dpctl.SyclQueue>q).get_queue_ref(),
+                memory_owner=dlmv_holder if versioned else dlm_holder
+            )
+            if mem_ptr_delta == 0:
+                usm_mem = tmp
+            else:
+                alloc_sz = dl_tensor.byte_offset + <uint64_t>(
+                    (offset_max * element_bytesize + mem_ptr_delta))
+                usm_mem = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+                    <DPCTLSyclUSMRef> (
+                        mem_ptr + (element_bytesize - mem_ptr_delta)
+                    ),
+                    max(alloc_sz, <uint64_t>element_bytesize),
+                    (<c_dpctl.SyclQueue>q).get_queue_ref(),
+                    memory_owner=tmp
+                )
+
+        py_shape = list()
+        if (dl_tensor.shape is not NULL):
+            for i in range(dl_tensor.ndim):
+                py_shape.append(dl_tensor.shape[i])
+        if (dl_tensor.strides is not NULL):
+            py_strides = list()
+            for i in range(dl_tensor.ndim):
+                py_strides.append(dl_tensor.strides[i])
+        else:
+            py_strides = None
+        if (dl_tensor.dtype.code == kDLUInt):
+            ary_dt = np.dtype("u" + str(element_bytesize))
+        elif (dl_tensor.dtype.code == kDLInt):
+            ary_dt = np.dtype("i" + str(element_bytesize))
+        elif (dl_tensor.dtype.code == kDLFloat):
+            ary_dt = np.dtype("f" + str(element_bytesize))
+        elif (dl_tensor.dtype.code == kDLComplex):
+            ary_dt = np.dtype("c" + str(element_bytesize))
+        elif (dl_tensor.dtype.code == kDLBool):
+            ary_dt = np.dtype("?")
+        else:
+            raise BufferError(
+                "Can not import DLPack tensor with type code {}.".format(
+                    <object>dl_tensor.dtype.code
+                )
+            )
+        res_ary = usm_ndarray(
+            py_shape,
+            dtype=ary_dt,
+            buffer=usm_mem,
+            strides=py_strides,
+            offset=element_offset
+        )
+        if readonly:
+            res_ary.flags_ = (res_ary.flags_ & ~USM_ARRAY_WRITABLE)
+        return res_ary
+    elif _is_kdlcpu_device(&dl_tensor.device):
+        ary_iface = _numpy_array_interface_from_dl_tensor(dl_tensor, readonly)
+        if not versioned:
+            dlm_holder = _DLManagedTensorOwner._create(dlm_tensor)
+            cpython.PyCapsule_SetName(py_caps, "used_dltensor")
+            return np.ctypeslib.as_array(
+                _numpy_array_interface_wrapper(ary_iface, dlm_holder)
+            )
+        else:
+            dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor)
+            cpython.PyCapsule_SetName(py_caps, "used_dltensor_versioned")
+            return np.ctypeslib.as_array(
+                _numpy_array_interface_wrapper(ary_iface, dlmv_holder)
+            )
+    else:
+        raise BufferError(
+            "The DLPack tensor resides on unsupported device."
+        )
+
+cdef usm_ndarray _to_usm_ary_from_host_blob(object host_blob, dev : Device):
+    q = dev.sycl_queue
+    np_ary = np.asarray(host_blob)
+    dt = np_ary.dtype
+    if dt.char in "dD" and q.sycl_device.has_aspect_fp64 is False:
+        Xusm_dtype = (
+            "float32" if dt.char == "d" else "complex64"
+        )
+    else:
+        Xusm_dtype = dt
+    usm_mem = dpmem.MemoryUSMDevice(np_ary.nbytes, queue=q)
+    usm_ary = usm_ndarray(np_ary.shape, dtype=Xusm_dtype, buffer=usm_mem)
+    usm_mem.copy_from_host(np.reshape(np_ary.view(dtype="u1"), -1))
+    return usm_ary
+
+
+# only cdef to make it private
+cdef object _create_device(object device, object dl_device):
+    if isinstance(device, Device):
+        return device
+    elif isinstance(device, dpctl.SyclDevice):
+        return Device.create_device(device)
+    else:
+        root_device = dpctl.SyclDevice(str(<int>dl_device[1]))
+        return Device.create_device(root_device)
+
+
+def from_dlpack(x, /, *, device=None, copy=None):
+    """from_dlpack(x, /, *, device=None, copy=None)
+
+    Constructs :class:`dpctl.tensor.usm_ndarray` or :class:`numpy.ndarray`
+    instance from a Python object ``x`` that implements ``__dlpack__`` protocol.
+
+    Args:
+        x (object):
+            A Python object representing an array that supports
+            ``__dlpack__`` protocol.
+        device (
+            Optional[str, :class:`dpctl.SyclDevice`,
+            :class:`dpctl.SyclQueue`,
+            :class:`dpctl.tensor.Device`,
+            tuple([:class:`enum.IntEnum`, int])])
+        ):
+            Device where the output array is to be placed. ``device`` keyword
+            values can be:
+
+            * ``None``
+                The data remains on the same device.
+            * oneAPI filter selector string
+                SYCL device selected by :ref:`filter selector string
+                <filter_selector_string>`.
+            * :class:`dpctl.SyclDevice`
+                explicit SYCL device that must correspond to
+                a non-partitioned SYCL device.
+            * :class:`dpctl.SyclQueue`
+                implies SYCL device targeted by the SYCL queue.
+            * :class:`dpctl.tensor.Device`
+                implies SYCL device `device.sycl_queue`. The `Device` object
+                is obtained via :attr:`dpctl.tensor.usm_ndarray.device`.
+            * ``(device_type, device_id)``
+               2-tuple matching the format of the output of the
+               ``__dlpack_device__`` method: an integer enumerator representing
+               the device type followed by an integer representing the index of
+               the device. The only supported :class:`dpctl.tensor.DLDeviceType`
+               device types are ``"kDLCPU"`` and ``"kDLOneAPI"``.
+
+            Default: ``None``.
+
+        copy (bool, optional)
+            Boolean indicating whether or not to copy the input.
+
+            * If ``copy`` is ``True``, the input will always be
+              copied.
+            * If ``False``, a ``BufferError`` will be raised if a
+              copy is deemed necessary.
+            * If ``None``, a copy will be made only if deemed
+              necessary, otherwise, the existing memory buffer will
+              be reused.
+
+            Default: ``None``.
+
+    Returns:
+        Alternative[usm_ndarray, numpy.ndarray]:
+            An array containing the data in ``x``. When ``copy`` is
+            ``None`` or ``False``, this may be a view into the original
+            memory.
+
+            The type of the returned object
+            depends on where the data backing up input object ``x`` resides.
+            If it resides in a USM allocation on a SYCL device, the
+            type :class:`dpctl.tensor.usm_ndarray` is returned, otherwise if it
+            resides on ``"kDLCPU"`` device the type is :class:`numpy.ndarray`,
+            and otherwise an exception is raised.
+
+            .. note::
+
+                If the return type is :class:`dpctl.tensor.usm_ndarray`, the
+                associated SYCL queue is derived from the ``device`` keyword.
+                When ``device`` keyword value has type :class:`dpctl.SyclQueue`,
+                the explicit queue instance is used, when ``device`` keyword
+                value has type :class:`dpctl.tensor.Device`, the
+                ``device.sycl_queue`` is used. In all other cases, the cached
+                SYCL queue corresponding to the implied SYCL device is used.
+
+    Raises:
+        TypeError:
+            if ``x`` does not implement ``__dlpack__`` method
+        ValueError:
+            if data of the input object resides on an unsupported device
+
+    See https://dmlc.github.io/dlpack/latest/ for more details.
+
+    :Example:
+
+        .. code-block:: python
+
+            import dpctl
+            import dpctl_ext.tensor as dpt
+
+            class Container:
+                "Helper class implementing `__dlpack__` protocol"
+                def __init__(self, array):
+                    self._array = array
+
+                def __dlpack__(self, stream=None):
+                    return self._array.__dlpack__(stream=stream)
+
+                def __dlpack_device__(self):
+                    return self._array.__dlpack_device__()
+
+            C = Container(dpt.linspace(0, 100, num=20, dtype="int16"))
+            # create usm_ndarray view
+            X = dpt.from_dlpack(C)
+            # migrate content of the container to device of type kDLCPU
+            Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0))
+
+    """
+    dlpack_attr = getattr(x, "__dlpack__", None)
+    dlpack_dev_attr = getattr(x, "__dlpack_device__", None)
+    if not callable(dlpack_attr) or not callable(dlpack_dev_attr):
+        raise TypeError(
+            f"The argument of type {type(x)} does not implement "
+            "`__dlpack__` and `__dlpack_device__` methods."
+        )
+    # device is converted to a dlpack_device if necessary
+    dl_device = None
+    if device:
+        if isinstance(device, tuple):
+            dl_device = device
+            if len(dl_device) != 2:
+                raise ValueError(
+                    "Argument `device` specified as a tuple must have length 2"
+                )
+        else:
+            if not isinstance(device, dpctl.SyclDevice):
+                device = Device.create_device(device)
+                d = device.sycl_device
+            else:
+                d = device
+            dl_device = (device_OneAPI, d.get_device_id())
+    if dl_device is not None:
+        if (dl_device[0] not in [device_OneAPI, device_CPU]):
+            raise ValueError(
+                f"Argument `device`={device} is not supported."
+            )
+    got_type_error = False
+    got_buffer_error = False
+    got_other_error = False
+    saved_exception = None
+    # First DLPack version supporting dl_device, and copy
+    requested_ver = (1, 0)
+    cpu_dev = (device_CPU, 0)
+    try:
+        # setting max_version to minimal version that supports
+        # dl_device/copy keywords
+        dlpack_capsule = dlpack_attr(
+            max_version=requested_ver,
+            dl_device=dl_device,
+            copy=copy
+        )
+    except TypeError:
+        # exporter does not support max_version keyword
+        got_type_error = True
+    except (BufferError, NotImplementedError, ValueError) as e:
+        # Either dl_device, or copy cannot be satisfied
+        got_buffer_error = True
+        saved_exception = e
+    except Exception as e:
+        got_other_error = True
+        saved_exception = e
+    else:
+        # execution did not raise exceptions
+        return from_dlpack_capsule(dlpack_capsule)
+    finally:
+        if got_type_error:
+            # max_version/dl_device, copy keywords are not supported
+            # by __dlpack__
+            x_dldev = dlpack_dev_attr()
+            if (dl_device is None) or (dl_device == x_dldev):
+                dlpack_capsule = dlpack_attr()
+                return from_dlpack_capsule(dlpack_capsule)
+            # must copy via host
+            if copy is False:
+                raise BufferError(
+                    "Importing data via DLPack requires copying, but "
+                    "copy=False was provided"
+                )
+            # when max_version/dl_device/copy are not supported
+            # we can only support importing to OneAPI devices
+            # from host, or from another oneAPI device
+            is_supported_x_dldev = (
+                x_dldev == cpu_dev or
+                (x_dldev[0] == device_OneAPI)
+            )
+            is_supported_dl_device = (
+                dl_device == cpu_dev or
+                dl_device[0] == device_OneAPI
+            )
+            if is_supported_x_dldev and is_supported_dl_device:
+                dlpack_capsule = dlpack_attr()
+                blob = from_dlpack_capsule(dlpack_capsule)
+            else:
+                raise BufferError(
+                    f"Can not import to requested device {dl_device}"
+                )
+            dev = _create_device(device, dl_device)
+            if x_dldev == cpu_dev and dl_device == cpu_dev:
+                # both source and destination are CPU
+                return blob
+            elif x_dldev == cpu_dev:
+                # source is CPU, destination is oneAPI
+                return _to_usm_ary_from_host_blob(blob, dev)
+            elif dl_device == cpu_dev:
+                # source is oneAPI, destination is CPU
+                cpu_caps = blob.__dlpack__(
+                    max_version=get_build_dlpack_version(),
+                    dl_device=cpu_dev
+                )
+                return from_dlpack_capsule(cpu_caps)
+            else:
+                # TODO: revert to `import dpctl.tensor`
+                # when dpnp fully migrates dpctl/tensor
+                import dpctl_ext.tensor as dpt
+                return dpt.asarray(blob, device=dev)
+        elif got_buffer_error:
+            # we are here, because dlpack_attr could not deal with requested
+            # dl_device, or copying was required
+            if copy is False:
+                raise BufferError(
+                    "Importing data via DLPack requires copying, but "
+                    "copy=False was provided"
+                )
+            if dl_device is None:
+                raise saved_exception
+            # must copy via host
+            if dl_device[0] != device_OneAPI:
+                raise BufferError(
+                    f"Can not import to requested device {dl_device}"
+                )
+            x_dldev = dlpack_dev_attr()
+            if x_dldev == cpu_dev:
+                dlpack_capsule = dlpack_attr()
+                host_blob = from_dlpack_capsule(dlpack_capsule)
+            else:
+                dlpack_capsule = dlpack_attr(
+                    max_version=requested_ver,
+                    dl_device=cpu_dev,
+                    copy=copy
+                )
+                host_blob = from_dlpack_capsule(dlpack_capsule)
+            dev = _create_device(device, dl_device)
+            return _to_usm_ary_from_host_blob(host_blob, dev)
+        elif got_other_error:
+            raise saved_exception
diff --git a/dpctl_ext/tensor/_elementwise_common.py b/dpctl_ext/tensor/_elementwise_common.py
index 7fd9dabf9614..ffe849db9cad 100644
--- a/dpctl_ext/tensor/_elementwise_common.py
+++ b/dpctl_ext/tensor/_elementwise_common.py
@@ -27,12 +27,11 @@
 # *****************************************************************************
 
 import dpctl
-import dpctl.tensor as dpt
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 
 from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
@@ -233,7 +232,7 @@ def __call__(self, x, /, *, out=None, order="K"):
                 # Allocate a temporary buffer to avoid memory overlapping.
                 # Note if `buf_dt` is not None, a temporary copy of `x` will be
                 # created, so the array overlap check isn't needed.
-                out = dpt_ext.empty_like(out)
+                out = dpt.empty_like(out)
 
             if (
                 dpctl.utils.get_execution_queue((x.sycl_queue, out.sycl_queue))
@@ -252,7 +251,7 @@ def __call__(self, x, /, *, out=None, order="K"):
                 else:
                     if order == "A":
                         order = "F" if x.flags.f_contiguous else "C"
-                    out = dpt_ext.empty_like(x, dtype=res_dt, order=order)
+                    out = dpt.empty_like(x, dtype=res_dt, order=order)
 
             dep_evs = _manager.submitted_events
             ht_unary_ev, unary_ev = self.unary_fn_(
@@ -275,7 +274,7 @@ def __call__(self, x, /, *, out=None, order="K"):
         else:
             if order == "A":
                 order = "F" if x.flags.f_contiguous else "C"
-            buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order)
+            buf = dpt.empty_like(x, dtype=buf_dt, order=order)
 
         dep_evs = _manager.submitted_events
         ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -286,7 +285,7 @@ def __call__(self, x, /, *, out=None, order="K"):
             if order == "K":
                 out = _empty_like_orderK(buf, res_dt)
             else:
-                out = dpt_ext.empty_like(buf, dtype=res_dt, order=order)
+                out = dpt.empty_like(buf, dtype=res_dt, order=order)
 
         ht, uf_ev = self.unary_fn_(
             buf, out, sycl_queue=exec_q, depends=[copy_ev]
@@ -597,7 +596,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
             if isinstance(o1, dpt.usm_ndarray):
                 if ti._array_overlap(o1, out) and buf1_dt is None:
                     if not ti._same_logical_tensors(o1, out):
-                        out = dpt_ext.empty_like(out)
+                        out = dpt.empty_like(out)
                     elif self.binary_inplace_fn_ is not None:
                         # if there is a dedicated in-place kernel
                         # it can be called here, otherwise continues
@@ -610,12 +609,12 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                             ):
                                 buf2_dt = o2_dtype
                         else:
-                            src2 = dpt_ext.asarray(
+                            src2 = dpt.asarray(
                                 o2, dtype=o2_dtype, sycl_queue=exec_q
                             )
                         if buf2_dt is None:
                             if src2.shape != res_shape:
-                                src2 = dpt_ext.broadcast_to(src2, res_shape)
+                                src2 = dpt.broadcast_to(src2, res_shape)
                             dep_evs = _manager.submitted_events
                             ht_, comp_ev = self.binary_inplace_fn_(
                                 lhs=o1,
@@ -625,7 +624,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                             )
                             _manager.add_event_pair(ht_, comp_ev)
                         else:
-                            buf2 = dpt_ext.empty_like(src2, dtype=buf2_dt)
+                            buf2 = dpt.empty_like(src2, dtype=buf2_dt)
                             dep_evs = _manager.submitted_events
                             (
                                 ht_copy_ev,
@@ -638,7 +637,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                             )
                             _manager.add_event_pair(ht_copy_ev, copy_ev)
 
-                            buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+                            buf2 = dpt.broadcast_to(buf2, res_shape)
                             ht_, bf_ev = self.binary_inplace_fn_(
                                 lhs=o1,
                                 rhs=buf2,
@@ -657,16 +656,16 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                 ):
                     # should not reach if out is reallocated
                     # after being checked against o1
-                    out = dpt_ext.empty_like(out)
+                    out = dpt.empty_like(out)
 
         if isinstance(o1, dpt.usm_ndarray):
             src1 = o1
         else:
-            src1 = dpt_ext.asarray(o1, dtype=o1_dtype, sycl_queue=exec_q)
+            src1 = dpt.asarray(o1, dtype=o1_dtype, sycl_queue=exec_q)
         if isinstance(o2, dpt.usm_ndarray):
             src2 = o2
         else:
-            src2 = dpt_ext.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
+            src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
 
         if order == "A":
             order = (
@@ -688,7 +687,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                         src1, src2, res_dt, res_shape, res_usm_type, exec_q
                     )
                 else:
-                    out = dpt_ext.empty(
+                    out = dpt.empty(
                         res_shape,
                         dtype=res_dt,
                         usm_type=res_usm_type,
@@ -696,9 +695,9 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                         order=order,
                     )
             if src1.shape != res_shape:
-                src1 = dpt_ext.broadcast_to(src1, res_shape)
+                src1 = dpt.broadcast_to(src1, res_shape)
             if src2.shape != res_shape:
-                src2 = dpt_ext.broadcast_to(src2, res_shape)
+                src2 = dpt.broadcast_to(src2, res_shape)
             deps_ev = _manager.submitted_events
             ht_binary_ev, binary_ev = self.binary_fn_(
                 src1=src1,
@@ -723,7 +722,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
             if order == "K":
                 buf2 = _empty_like_orderK(src2, buf2_dt)
             else:
-                buf2 = dpt_ext.empty_like(src2, dtype=buf2_dt, order=order)
+                buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order)
             dep_evs = _manager.submitted_events
             ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
                 src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs
@@ -735,7 +734,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                         src1, buf2, res_dt, res_shape, res_usm_type, exec_q
                     )
                 else:
-                    out = dpt_ext.empty(
+                    out = dpt.empty(
                         res_shape,
                         dtype=res_dt,
                         usm_type=res_usm_type,
@@ -744,8 +743,8 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                     )
 
             if src1.shape != res_shape:
-                src1 = dpt_ext.broadcast_to(src1, res_shape)
-            buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+                src1 = dpt.broadcast_to(src1, res_shape)
+            buf2 = dpt.broadcast_to(buf2, res_shape)
             ht_binary_ev, binary_ev = self.binary_fn_(
                 src1=src1,
                 src2=buf2,
@@ -769,7 +768,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
             if order == "K":
                 buf1 = _empty_like_orderK(src1, buf1_dt)
             else:
-                buf1 = dpt_ext.empty_like(src1, dtype=buf1_dt, order=order)
+                buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order)
             dep_evs = _manager.submitted_events
             ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
                 src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs
@@ -781,7 +780,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                         buf1, src2, res_dt, res_shape, res_usm_type, exec_q
                     )
                 else:
-                    out = dpt_ext.empty(
+                    out = dpt.empty(
                         res_shape,
                         dtype=res_dt,
                         usm_type=res_usm_type,
@@ -789,9 +788,9 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                         order=order,
                     )
 
-            buf1 = dpt_ext.broadcast_to(buf1, res_shape)
+            buf1 = dpt.broadcast_to(buf1, res_shape)
             if src2.shape != res_shape:
-                src2 = dpt_ext.broadcast_to(src2, res_shape)
+                src2 = dpt.broadcast_to(src2, res_shape)
             ht_binary_ev, binary_ev = self.binary_fn_(
                 src1=buf1,
                 src2=src2,
@@ -820,7 +819,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
         if order == "K":
             buf1 = _empty_like_orderK(src1, buf1_dt)
         else:
-            buf1 = dpt_ext.empty_like(src1, dtype=buf1_dt, order=order)
+            buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order)
         dep_evs = _manager.submitted_events
         ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs
@@ -829,7 +828,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
         if order == "K":
             buf2 = _empty_like_orderK(src2, buf2_dt)
         else:
-            buf2 = dpt_ext.empty_like(src2, dtype=buf2_dt, order=order)
+            buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order)
         ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs
         )
@@ -840,7 +839,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                     buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
                 )
             else:
-                out = dpt_ext.empty(
+                out = dpt.empty(
                     res_shape,
                     dtype=res_dt,
                     usm_type=res_usm_type,
@@ -848,8 +847,8 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                     order=order,
                 )
 
-        buf1 = dpt_ext.broadcast_to(buf1, res_shape)
-        buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+        buf1 = dpt.broadcast_to(buf1, res_shape)
+        buf2 = dpt.broadcast_to(buf2, res_shape)
         ht_, bf_ev = self.binary_fn_(
             src1=buf1,
             src2=buf2,
@@ -960,10 +959,10 @@ def _inplace_op(self, o1, o2):
             ):
                 buf_dt = o2_dtype
         else:
-            src2 = dpt_ext.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
+            src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
         if buf_dt is None:
             if src2.shape != res_shape:
-                src2 = dpt_ext.broadcast_to(src2, res_shape)
+                src2 = dpt.broadcast_to(src2, res_shape)
             dep_evs = _manager.submitted_events
             ht_, comp_ev = self.binary_inplace_fn_(
                 lhs=o1,
@@ -973,7 +972,7 @@ def _inplace_op(self, o1, o2):
             )
             _manager.add_event_pair(ht_, comp_ev)
         else:
-            buf = dpt_ext.empty_like(src2, dtype=buf_dt)
+            buf = dpt.empty_like(src2, dtype=buf_dt)
             dep_evs = _manager.submitted_events
             (
                 ht_copy_ev,
@@ -986,7 +985,7 @@ def _inplace_op(self, o1, o2):
             )
             _manager.add_event_pair(ht_copy_ev, copy_ev)
 
-            buf = dpt_ext.broadcast_to(buf, res_shape)
+            buf = dpt.broadcast_to(buf, res_shape)
             ht_, bf_ev = self.binary_inplace_fn_(
                 lhs=o1,
                 rhs=buf,
diff --git a/dpctl_ext/tensor/_flags.pyx b/dpctl_ext/tensor/_flags.pyx
new file mode 100644
index 000000000000..322d52bd56c7
--- /dev/null
+++ b/dpctl_ext/tensor/_flags.pyx
@@ -0,0 +1,175 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+# cython: linetrace=True
+
+from libcpp cimport bool as cpp_bool
+
+from ._usmarray cimport (
+    USM_ARRAY_C_CONTIGUOUS,
+    USM_ARRAY_F_CONTIGUOUS,
+    USM_ARRAY_WRITABLE,
+    usm_ndarray,
+)
+
+
+cdef cpp_bool _check_bit(int flag, int mask):
+    return (flag & mask) == mask
+
+
+cdef class Flags:
+    """
+    Helper class to query the flags of a :class:`dpctl.tensor.usm_ndarray`
+    instance, which describe how the instance interfaces with its underlying
+    memory.
+    """
+    cdef int flags_
+    cdef usm_ndarray arr_
+
+    def __cinit__(self, usm_ndarray arr, int flags):
+        self.arr_ = arr
+        self.flags_ = flags
+
+    @property
+    def flags(self):
+        """
+        Integer representation of the memory layout flags of
+        :class:`dpctl.tensor.usm_ndarray` instance.
+        """
+        return self.flags_
+
+    @property
+    def c_contiguous(self):
+        """
+        True if the memory layout of the
+        :class:`dpctl.tensor.usm_ndarray` instance is C-contiguous.
+        """
+        return _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
+
+    @property
+    def f_contiguous(self):
+        """
+        True if the memory layout of the
+        :class:`dpctl.tensor.usm_ndarray` instance is F-contiguous.
+        """
+        return _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
+
+    @property
+    def writable(self):
+        """
+        True if :class:`dpctl.tensor.usm_ndarray` instance is writable.
+        """
+        return _check_bit(self.flags_, USM_ARRAY_WRITABLE)
+
+    @writable.setter
+    def writable(self, new_val):
+        if not isinstance(new_val, bool):
+            raise TypeError("Expecting a boolean value")
+        self.arr_._set_writable_flag(new_val)
+
+    @property
+    def fc(self):
+        """
+        True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
+        instance is C-contiguous and F-contiguous.
+        """
+        return (
+           _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
+           and _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
+        )
+
+    @property
+    def forc(self):
+        """
+        True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
+        instance is C-contiguous or F-contiguous.
+        """
+        return (
+           _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
+           or _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
+        )
+
+    @property
+    def fnc(self):
+        """
+        True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
+        instance is F-contiguous and not C-contiguous.
+        """
+        return (
+           _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
+           and not _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
+        )
+
+    @property
+    def contiguous(self):
+        """
+        True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
+        instance is C-contiguous and F-contiguous.
+        Equivalent to `forc.`
+        """
+        return self.forc
+
+    def __getitem__(self, name):
+        if name in ["C_CONTIGUOUS", "C"]:
+            return self.c_contiguous
+        elif name in ["F_CONTIGUOUS", "F"]:
+            return self.f_contiguous
+        elif name in ["WRITABLE", "W"]:
+            return self.writable
+        elif name == "FC":
+            return self.fc
+        elif name == "FNC":
+            return self.fnc
+        elif name in ["FORC", "CONTIGUOUS"]:
+            return self.forc
+
+    def __setitem__(self, name, val):
+        if name in ["WRITABLE", "W"]:
+            self.writable = val
+        else:
+            raise ValueError(
+                "Only writable ('W' or 'WRITABLE') flag can be set"
+            )
+
+    def __repr__(self):
+        out = []
+        for name in "C_CONTIGUOUS", "F_CONTIGUOUS", "WRITABLE":
+            out.append("  {} : {}".format(name, self[name]))
+        return "\n".join(out)
+
+    def __eq__(self, other):
+        cdef Flags other_
+        if isinstance(other, self.__class__):
+            other_ = <Flags>other
+            return self.flags_ == other_.flags_
+        elif isinstance(other, int):
+            return self.flags_ == <int>other
+        else:
+            return False
diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py
index 5b4eb1aaf7a2..08db81c1b166 100644
--- a/dpctl_ext/tensor/_indexing_functions.py
+++ b/dpctl_ext/tensor/_indexing_functions.py
@@ -29,12 +29,11 @@
 import operator
 
 import dpctl
-import dpctl.tensor as dpt
 import dpctl.utils
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 
 from ._copy_utils import (
@@ -57,7 +56,7 @@ def _get_indexing_mode(name):
 
 
 def _range(sh_i, i, nd, q, usm_t, dt):
-    ind = dpt_ext.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q)
+    ind = dpt.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q)
     ind.shape = tuple(sh_i if i == j else 1 for j in range(nd))
     return ind
 
@@ -177,7 +176,7 @@ def place(arr, mask, vals):
         raise dpctl.utils.ExecutionPlacementError
     if arr.shape != mask.shape or vals.ndim != 1:
         raise ValueError("Array sizes are not as required")
-    cumsum = dpt_ext.empty(mask.size, dtype="i8", sycl_queue=exec_q)
+    cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q)
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
     deps_ev = _manager.submitted_events
     nz_count = ti.mask_positions(
@@ -190,7 +189,7 @@ def place(arr, mask, vals):
     if vals.dtype == arr.dtype:
         rhs = vals
     else:
-        rhs = dpt_ext.astype(vals, arr.dtype)
+        rhs = dpt.astype(vals, arr.dtype)
     hev, pl_ev = ti._place(
         dst=arr,
         cumsum=cumsum,
@@ -329,7 +328,7 @@ def put_vec_duplicates(vec, ind, vals):
         val_shape = indices.shape
 
     if not isinstance(vals, dpt.usm_ndarray):
-        vals = dpt_ext.asarray(
+        vals = dpt.asarray(
             vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q
         )
     # choose to throw here for consistency with `place`
@@ -340,8 +339,8 @@ def put_vec_duplicates(vec, ind, vals):
     if vals.dtype == x.dtype:
         rhs = vals
     else:
-        rhs = dpt_ext.astype(vals, x.dtype)
-    rhs = dpt_ext.broadcast_to(rhs, val_shape)
+        rhs = dpt.astype(vals, x.dtype)
+    rhs = dpt.broadcast_to(rhs, val_shape)
 
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
     deps_ev = _manager.submitted_events
@@ -540,9 +539,9 @@ def take(x, indices, /, *, axis=None, out=None, mode="wrap"):
                 "Input and output allocation queues are not compatible"
             )
         if ti._array_overlap(x, out):
-            out = dpt_ext.empty_like(out)
+            out = dpt.empty_like(out)
     else:
-        out = dpt_ext.empty(
+        out = dpt.empty(
             res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q
         )
 
diff --git a/dpctl_ext/tensor/_linear_algebra_functions.py b/dpctl_ext/tensor/_linear_algebra_functions.py
index 5f6edecf5e59..6dfb30e881b2 100644
--- a/dpctl_ext/tensor/_linear_algebra_functions.py
+++ b/dpctl_ext/tensor/_linear_algebra_functions.py
@@ -29,12 +29,11 @@
 import operator
 
 import dpctl
-import dpctl.tensor as dpt
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_elementwise_impl as tei
 import dpctl_ext.tensor._tensor_impl as ti
 import dpctl_ext.tensor._tensor_linalg_impl as tli
@@ -181,8 +180,8 @@ def tensordot(x1, x2, axes=2):
         axes2 = normalize_axis_tuple(axes2, x2_nd)
         perm1 = [i for i in range(x1_nd) if i not in axes1] + list(axes1)
         perm2 = list(axes2) + [i for i in range(x2_nd) if i not in axes2]
-        arr1 = dpt_ext.permute_dims(x1, perm1)
-        arr2 = dpt_ext.permute_dims(x2, perm2)
+        arr1 = dpt.permute_dims(x1, perm1)
+        arr2 = dpt.permute_dims(x2, perm2)
     arr1_outer_nd = arr1.ndim - n_axes1
     arr2_outer_nd = arr2.ndim - n_axes2
     res_shape = arr1.shape[:arr1_outer_nd] + arr2.shape[n_axes2:]
@@ -207,7 +206,7 @@ def tensordot(x1, x2, axes=2):
 
     _manager = SequentialOrderManager[exec_q]
     if buf1_dt is None and buf2_dt is None:
-        out = dpt_ext.empty(
+        out = dpt.empty(
             res_shape,
             dtype=res_dt,
             usm_type=res_usm_type,
@@ -238,7 +237,7 @@ def tensordot(x1, x2, axes=2):
             src=arr2, dst=buf2, sycl_queue=exec_q, depends=dep_evs
         )
         _manager.add_event_pair(ht_copy_ev, copy_ev)
-        out = dpt_ext.empty(
+        out = dpt.empty(
             res_shape,
             dtype=res_dt,
             usm_type=res_usm_type,
@@ -267,7 +266,7 @@ def tensordot(x1, x2, axes=2):
             src=arr1, dst=buf1, sycl_queue=exec_q, depends=dep_evs
         )
         _manager.add_event_pair(ht_copy_ev, copy_ev)
-        out = dpt_ext.empty(
+        out = dpt.empty(
             res_shape,
             dtype=res_dt,
             usm_type=res_usm_type,
@@ -300,7 +299,7 @@ def tensordot(x1, x2, axes=2):
         src=arr2, dst=buf2, sycl_queue=exec_q, depends=deps_ev
     )
     _manager.add_event_pair(ht_copy2_ev, copy2_ev)
-    out = dpt_ext.empty(
+    out = dpt.empty(
         res_shape,
         dtype=res_dt,
         usm_type=res_usm_type,
@@ -435,12 +434,12 @@ def vecdot(x1, x2, axis=-1):
             _manager.add_event_pair(ht_conj_ev, conj_ev)
             x1 = x1_tmp
         if x1.shape != broadcast_sh:
-            x1 = dpt_ext.broadcast_to(x1, broadcast_sh)
+            x1 = dpt.broadcast_to(x1, broadcast_sh)
         if x2.shape != broadcast_sh:
-            x2 = dpt_ext.broadcast_to(x2, broadcast_sh)
-        x1 = dpt_ext.moveaxis(x1, contracted_axis, -1)
-        x2 = dpt_ext.moveaxis(x2, contracted_axis, -1)
-        out = dpt_ext.empty(
+            x2 = dpt.broadcast_to(x2, broadcast_sh)
+        x1 = dpt.moveaxis(x1, contracted_axis, -1)
+        x2 = dpt.moveaxis(x2, contracted_axis, -1)
+        out = dpt.empty(
             res_sh,
             dtype=res_dt,
             usm_type=res_usm_type,
@@ -460,7 +459,7 @@ def vecdot(x1, x2, axis=-1):
             depends=dep_evs,
         )
         _manager.add_event_pair(ht_dot_ev, dot_ev)
-        return dpt_ext.reshape(out, res_sh)
+        return dpt.reshape(out, res_sh)
 
     elif buf1_dt is None:
         if x1.dtype.kind == "c":
@@ -478,12 +477,12 @@ def vecdot(x1, x2, axis=-1):
         )
         _manager.add_event_pair(ht_copy_ev, copy_ev)
         if x1.shape != broadcast_sh:
-            x1 = dpt_ext.broadcast_to(x1, broadcast_sh)
+            x1 = dpt.broadcast_to(x1, broadcast_sh)
         if buf2.shape != broadcast_sh:
-            buf2 = dpt_ext.broadcast_to(buf2, broadcast_sh)
-        x1 = dpt_ext.moveaxis(x1, contracted_axis, -1)
-        buf2 = dpt_ext.moveaxis(buf2, contracted_axis, -1)
-        out = dpt_ext.empty(
+            buf2 = dpt.broadcast_to(buf2, broadcast_sh)
+        x1 = dpt.moveaxis(x1, contracted_axis, -1)
+        buf2 = dpt.moveaxis(buf2, contracted_axis, -1)
+        out = dpt.empty(
             res_sh,
             dtype=res_dt,
             usm_type=res_usm_type,
@@ -502,7 +501,7 @@ def vecdot(x1, x2, axis=-1):
             depends=[copy_ev],
         )
         _manager.add_event_pair(ht_dot_ev, dot_ev)
-        return dpt_ext.reshape(out, res_sh)
+        return dpt.reshape(out, res_sh)
 
     elif buf2_dt is None:
         buf1 = _empty_like_orderK(x1, buf1_dt)
@@ -517,12 +516,12 @@ def vecdot(x1, x2, axis=-1):
             )
             _manager.add_event_pair(ht_conj_ev, conj_ev)
         if buf1.shape != broadcast_sh:
-            buf1 = dpt_ext.broadcast_to(buf1, broadcast_sh)
+            buf1 = dpt.broadcast_to(buf1, broadcast_sh)
         if x2.shape != broadcast_sh:
-            x2 = dpt_ext.broadcast_to(x2, broadcast_sh)
-        buf1 = dpt_ext.moveaxis(buf1, contracted_axis, -1)
-        x2 = dpt_ext.moveaxis(x2, contracted_axis, -1)
-        out = dpt_ext.empty(
+            x2 = dpt.broadcast_to(x2, broadcast_sh)
+        buf1 = dpt.moveaxis(buf1, contracted_axis, -1)
+        x2 = dpt.moveaxis(x2, contracted_axis, -1)
+        out = dpt.empty(
             res_sh,
             dtype=res_dt,
             usm_type=res_usm_type,
@@ -542,7 +541,7 @@ def vecdot(x1, x2, axis=-1):
             depends=deps_ev,
         )
         _manager.add_event_pair(ht_dot_ev, dot_ev)
-        return dpt_ext.reshape(out, res_sh)
+        return dpt.reshape(out, res_sh)
 
     buf1 = _empty_like_orderK(x1, buf1_dt)
     deps_ev = _manager.submitted_events
@@ -561,12 +560,12 @@ def vecdot(x1, x2, axis=-1):
     )
     _manager.add_event_pair(ht_copy2_ev, copy2_ev)
     if buf1.shape != broadcast_sh:
-        buf1 = dpt_ext.broadcast_to(buf1, broadcast_sh)
+        buf1 = dpt.broadcast_to(buf1, broadcast_sh)
     if buf2.shape != broadcast_sh:
-        buf2 = dpt_ext.broadcast_to(buf2, broadcast_sh)
-    buf1 = dpt_ext.moveaxis(buf1, contracted_axis, -1)
-    buf2 = dpt_ext.moveaxis(buf2, contracted_axis, -1)
-    out = dpt_ext.empty(
+        buf2 = dpt.broadcast_to(buf2, broadcast_sh)
+    buf1 = dpt.moveaxis(buf1, contracted_axis, -1)
+    buf2 = dpt.moveaxis(buf2, contracted_axis, -1)
+    out = dpt.empty(
         res_sh,
         dtype=res_dt,
         usm_type=res_usm_type,
@@ -733,7 +732,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
         res_dt = _to_device_supported_dtype(res_dt, sycl_dev)
         buf1_dt, buf2_dt = None, None
         if x1_dtype != res_dt:
-            if dpt_ext.can_cast(x1_dtype, res_dt, casting="same_kind"):
+            if dpt.can_cast(x1_dtype, res_dt, casting="same_kind"):
                 buf1_dt = res_dt
             else:
                 raise ValueError(
@@ -743,7 +742,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                     "''same_kind''."
                 )
         if x2_dtype != res_dt:
-            if dpt_ext.can_cast(x2_dtype, res_dt, casting="same_kind"):
+            if dpt.can_cast(x2_dtype, res_dt, casting="same_kind"):
                 buf2_dt = res_dt
             else:
                 raise ValueError(
@@ -775,7 +774,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
             )
 
         if appended_axes:
-            out = dpt_ext.expand_dims(out, axis=appended_axes)
+            out = dpt.expand_dims(out, axis=appended_axes)
             orig_out = out
 
         if res_dt != out.dtype:
@@ -789,12 +788,12 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
             )
 
         if ti._array_overlap(x1, out) and buf1_dt is None:
-            out = dpt_ext.empty_like(out)
+            out = dpt.empty_like(out)
 
         if ti._array_overlap(x2, out) and buf2_dt is None:
             # should not reach if out is reallocated
             # after being checked against x1
-            out = dpt_ext.empty_like(out)
+            out = dpt.empty_like(out)
 
     if order == "A":
         order = (
@@ -817,7 +816,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                     x1, x2, res_dt, res_shape, res_usm_type, exec_q
                 )
             else:
-                out = dpt_ext.empty(
+                out = dpt.empty(
                     res_shape,
                     dtype=res_dt,
                     usm_type=res_usm_type,
@@ -825,9 +824,9 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                     order=order,
                 )
         if x1.shape != x1_broadcast_shape:
-            x1 = dpt_ext.broadcast_to(x1, x1_broadcast_shape)
+            x1 = dpt.broadcast_to(x1, x1_broadcast_shape)
         if x2.shape != x2_broadcast_shape:
-            x2 = dpt_ext.broadcast_to(x2, x2_broadcast_shape)
+            x2 = dpt.broadcast_to(x2, x2_broadcast_shape)
         deps_evs = _manager.submitted_events
         ht_dot_ev, dot_ev = tli._dot(
             x1=x1,
@@ -852,13 +851,13 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
             _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
             out = orig_out
         if appended_axes:
-            out = dpt_ext.squeeze(out, tuple(appended_axes))
+            out = dpt.squeeze(out, tuple(appended_axes))
         return out
     elif buf1_dt is None:
         if order == "K":
             buf2 = _empty_like_orderK(x2, buf2_dt)
         else:
-            buf2 = dpt_ext.empty_like(x2, dtype=buf2_dt, order=order)
+            buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order)
         deps_evs = _manager.submitted_events
         ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_evs
@@ -870,7 +869,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                     x1, buf2, res_dt, res_shape, res_usm_type, exec_q
                 )
             else:
-                out = dpt_ext.empty(
+                out = dpt.empty(
                     res_shape,
                     dtype=res_dt,
                     usm_type=res_usm_type,
@@ -879,9 +878,9 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                 )
 
         if x1.shape != x1_broadcast_shape:
-            x1 = dpt_ext.broadcast_to(x1, x1_broadcast_shape)
+            x1 = dpt.broadcast_to(x1, x1_broadcast_shape)
         if buf2.shape != x2_broadcast_shape:
-            buf2 = dpt_ext.broadcast_to(buf2, x2_broadcast_shape)
+            buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape)
         ht_dot_ev, dot_ev = tli._dot(
             x1=x1,
             x2=buf2,
@@ -905,14 +904,14 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
             _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
             out = orig_out
         if appended_axes:
-            out = dpt_ext.squeeze(out, tuple(appended_axes))
+            out = dpt.squeeze(out, tuple(appended_axes))
         return out
 
     elif buf2_dt is None:
         if order == "K":
             buf1 = _empty_like_orderK(x1, buf1_dt)
         else:
-            buf1 = dpt_ext.empty_like(x1, dtype=buf1_dt, order=order)
+            buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order)
         deps_ev = _manager.submitted_events
         ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev
@@ -924,7 +923,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                     buf1, x2, res_dt, res_shape, res_usm_type, exec_q
                 )
             else:
-                out = dpt_ext.empty(
+                out = dpt.empty(
                     res_shape,
                     dtype=res_dt,
                     usm_type=res_usm_type,
@@ -933,9 +932,9 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                 )
 
         if buf1.shape != x1_broadcast_shape:
-            buf1 = dpt_ext.broadcast_to(buf1, x1_broadcast_shape)
+            buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape)
         if x2.shape != x2_broadcast_shape:
-            x2 = dpt_ext.broadcast_to(x2, x2_broadcast_shape)
+            x2 = dpt.broadcast_to(x2, x2_broadcast_shape)
         ht_dot_ev, dot_ev = tli._dot(
             x1=buf1,
             x2=x2,
@@ -959,7 +958,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
             _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
             out = orig_out
         if appended_axes:
-            out = dpt_ext.squeeze(out, tuple(appended_axes))
+            out = dpt.squeeze(out, tuple(appended_axes))
         return out
 
     if order == "K":
@@ -970,7 +969,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
     if order == "K":
         buf1 = _empty_like_orderK(x1, buf1_dt)
     else:
-        buf1 = dpt_ext.empty_like(x1, dtype=buf1_dt, order=order)
+        buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order)
     deps_ev = _manager.submitted_events
     ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
         src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev
@@ -979,7 +978,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
     if order == "K":
         buf2 = _empty_like_orderK(x2, buf2_dt)
     else:
-        buf2 = dpt_ext.empty_like(x2, dtype=buf2_dt, order=order)
+        buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order)
     ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
         src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev
     )
@@ -990,7 +989,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                 buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
             )
         else:
-            out = dpt_ext.empty(
+            out = dpt.empty(
                 res_shape,
                 dtype=res_dt,
                 usm_type=res_usm_type,
@@ -999,9 +998,9 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
             )
 
     if buf1.shape != x1_broadcast_shape:
-        buf1 = dpt_ext.broadcast_to(buf1, x1_broadcast_shape)
+        buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape)
     if buf2.shape != x2_broadcast_shape:
-        buf2 = dpt_ext.broadcast_to(buf2, x2_broadcast_shape)
+        buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape)
     ht_, dot_ev = tli._dot(
         x1=buf1,
         x2=buf2,
@@ -1015,5 +1014,5 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
     )
     _manager.add_event_pair(ht_, dot_ev)
     if appended_axes:
-        out = dpt_ext.squeeze(out, tuple(appended_axes))
+        out = dpt.squeeze(out, tuple(appended_axes))
     return out
diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py
index e2d55c533bc0..33817dd0aa2e 100644
--- a/dpctl_ext/tensor/_manipulation_functions.py
+++ b/dpctl_ext/tensor/_manipulation_functions.py
@@ -30,13 +30,12 @@
 import operator
 
 import dpctl
-import dpctl.tensor as dpt
 import dpctl.utils as dputils
 import numpy as np
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 
 from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
@@ -174,7 +173,7 @@ def _concat_axis_None(arrays):
     res_shape = 0
     for array in arrays:
         res_shape += array.size
-    res = dpt_ext.empty(
+    res = dpt.empty(
         res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
     )
 
@@ -185,7 +184,7 @@ def _concat_axis_None(arrays):
         fill_end = fill_start + array.size
         if array.flags.c_contiguous:
             hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=dpt_ext.reshape(array, -1),
+                src=dpt.reshape(array, -1),
                 dst=res[fill_start:fill_end],
                 sycl_queue=exec_q,
                 depends=deps,
@@ -196,7 +195,7 @@ def _concat_axis_None(arrays):
             # _copy_usm_ndarray_for_reshape requires src and dst to have
             # the same data type
             if not array.dtype == res_dtype:
-                src2_ = dpt_ext.empty_like(src_, dtype=res_dtype)
+                src2_ = dpt.empty_like(src_, dtype=res_dtype)
                 ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
                     src=src_, dst=src2_, sycl_queue=exec_q, depends=deps
                 )
@@ -334,7 +333,7 @@ def concat(arrays, /, *, axis=0):
         X0_shape[i] if i != axis else res_shape_axis for i in range(X0.ndim)
     )
 
-    res = dpt_ext.empty(
+    res = dpt.empty(
         res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
     )
 
@@ -402,7 +401,7 @@ def expand_dims(X, /, *, axis=0):
     shape_it = iter(X.shape)
     shape = tuple(1 if ax in axis else next(shape_it) for ax in range(out_ndim))
 
-    return dpt_ext.reshape(X, shape)
+    return dpt.reshape(X, shape)
 
 
 def flip(X, /, *, axis=None):
@@ -485,7 +484,7 @@ def moveaxis(X, source, destination, /):
     for src, dst in sorted(zip(destination, source)):
         ind.insert(src, dst)
 
-    return dpt_ext.permute_dims(X, tuple(ind))
+    return dpt.permute_dims(X, tuple(ind))
 
 
 def permute_dims(X, /, axes):
@@ -602,7 +601,7 @@ def repeat(x, repeats, /, *, axis=None):
             )
         )
         dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-        if not dpt_ext.can_cast(repeats.dtype, dpt.int64, casting="same_kind"):
+        if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"):
             raise TypeError(
                 f"'repeats' data type {repeats.dtype} cannot be cast to "
                 "'int64' according to the casting rule ''safe.''"
@@ -624,7 +623,7 @@ def repeat(x, repeats, /, *, axis=None):
                     "'repeats' array must be broadcastable to the size of "
                     "the repeated axis"
                 )
-            if not dpt_ext.all(repeats >= 0):
+            if not dpt.all(repeats >= 0):
                 raise ValueError("'repeats' elements must be positive")
 
     elif isinstance(repeats, (tuple, list, range)):
@@ -643,10 +642,10 @@ def repeat(x, repeats, /, *, axis=None):
                     "`repeats` sequence must have the same length as the "
                     "repeated axis"
                 )
-            repeats = dpt_ext.asarray(
+            repeats = dpt.asarray(
                 repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q
             )
-            if not dpt_ext.all(repeats >= 0):
+            if not dpt.all(repeats >= 0):
                 raise ValueError("`repeats` elements must be positive")
     else:
         raise TypeError(
@@ -662,7 +661,7 @@ def repeat(x, repeats, /, *, axis=None):
             res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
         else:
             res_shape = (res_axis_size,)
-        res = dpt_ext.empty(
+        res = dpt.empty(
             res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q
         )
         if res_axis_size > 0:
@@ -677,7 +676,7 @@ def repeat(x, repeats, /, *, axis=None):
             _manager.add_event_pair(ht_rep_ev, rep_ev)
     else:
         if repeats.dtype != dpt.int64:
-            rep_buf = dpt_ext.empty(
+            rep_buf = dpt.empty(
                 repeats.shape,
                 dtype=dpt.int64,
                 usm_type=usm_type,
@@ -687,7 +686,7 @@ def repeat(x, repeats, /, *, axis=None):
                 src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs
             )
             _manager.add_event_pair(ht_copy_ev, copy_ev)
-            cumsum = dpt_ext.empty(
+            cumsum = dpt.empty(
                 (axis_size,),
                 dtype=dpt.int64,
                 usm_type=usm_type,
@@ -703,7 +702,7 @@ def repeat(x, repeats, /, *, axis=None):
                 )
             else:
                 res_shape = (res_axis_size,)
-            res = dpt_ext.empty(
+            res = dpt.empty(
                 res_shape,
                 dtype=x.dtype,
                 usm_type=usm_type,
@@ -720,7 +719,7 @@ def repeat(x, repeats, /, *, axis=None):
                 )
                 _manager.add_event_pair(ht_rep_ev, rep_ev)
         else:
-            cumsum = dpt_ext.empty(
+            cumsum = dpt.empty(
                 (axis_size,),
                 dtype=dpt.int64,
                 usm_type=usm_type,
@@ -735,7 +734,7 @@ def repeat(x, repeats, /, *, axis=None):
                 )
             else:
                 res_shape = (res_axis_size,)
-            res = dpt_ext.empty(
+            res = dpt.empty(
                 res_shape,
                 dtype=x.dtype,
                 usm_type=usm_type,
@@ -792,7 +791,7 @@ def roll(x, /, shift, *, axis=None):
     _manager = dputils.SequentialOrderManager[exec_q]
     if axis is None:
         shift = operator.index(shift)
-        res = dpt_ext.empty(
+        res = dpt.empty(
             x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
         )
         sz = operator.index(x.size)
@@ -819,7 +818,7 @@ def roll(x, /, shift, *, axis=None):
         n_i = operator.index(shape[ax])
         shifted = shifts[ax] + operator.index(sh)
         shifts[ax] = (shifted % n_i) if n_i > 0 else 0
-    res = dpt_ext.empty(
+    res = dpt.empty(
         x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
     )
     dep_evs = _manager.submitted_events
@@ -872,7 +871,7 @@ def squeeze(X, /, axis=None):
     if new_shape == X.shape:
         return X
     else:
-        return dpt_ext.reshape(X, new_shape)
+        return dpt.reshape(X, new_shape)
 
 
 def stack(arrays, /, *, axis=0):
@@ -917,7 +916,7 @@ def stack(arrays, /, *, axis=0):
         for i in range(res_ndim)
     )
 
-    res = dpt_ext.empty(
+    res = dpt.empty(
         res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
     )
 
@@ -971,7 +970,7 @@ def swapaxes(X, axis1, axis2):
     ind = list(range(0, X.ndim))
     ind[axis1] = axis2
     ind[axis2] = axis1
-    return dpt_ext.permute_dims(X, tuple(ind))
+    return dpt.permute_dims(X, tuple(ind))
 
 
 def unstack(X, /, *, axis=0):
@@ -998,7 +997,7 @@ def unstack(X, /, *, axis=0):
         raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
 
     axis = normalize_axis_index(axis, X.ndim)
-    Y = dpt_ext.moveaxis(X, axis, 0)
+    Y = dpt.moveaxis(X, axis, 0)
 
     return tuple(Y[i] for i in range(Y.shape[0]))
 
@@ -1049,11 +1048,11 @@ def tile(x, repetitions, /):
     if rep_dims < x_dims:
         repetitions = (x_dims - rep_dims) * (1,) + repetitions
     elif x_dims < rep_dims:
-        x = dpt_ext.reshape(x, (rep_dims - x_dims) * (1,) + x.shape)
+        x = dpt.reshape(x, (rep_dims - x_dims) * (1,) + x.shape)
     res_shape = tuple(map(lambda sh, rep: sh * rep, x.shape, repetitions))
     # case of empty input
     if x.size == 0:
-        return dpt_ext.empty(
+        return dpt.empty(
             res_shape,
             dtype=x.dtype,
             usm_type=x.usm_type,
@@ -1061,7 +1060,7 @@ def tile(x, repetitions, /):
         )
     in_sh = x.shape
     if res_shape == in_sh:
-        return dpt_ext.copy(x)
+        return dpt.copy(x)
     expanded_sh = []
     broadcast_sh = []
     out_sz = 1
@@ -1082,12 +1081,12 @@ def tile(x, repetitions, /):
     exec_q = x.sycl_queue
     xdt = x.dtype
     xut = x.usm_type
-    res = dpt_ext.empty((out_sz,), dtype=xdt, usm_type=xut, sycl_queue=exec_q)
+    res = dpt.empty((out_sz,), dtype=xdt, usm_type=xut, sycl_queue=exec_q)
     # no need to copy data for empty output
     if out_sz > 0:
-        x = dpt_ext.broadcast_to(
+        x = dpt.broadcast_to(
             # this reshape should never copy
-            dpt_ext.reshape(x, expanded_sh),
+            dpt.reshape(x, expanded_sh),
             broadcast_sh,
         )
         # copy broadcast input into flat array
@@ -1097,4 +1096,4 @@ def tile(x, repetitions, /):
             src=x, dst=res, sycl_queue=exec_q, depends=dep_evs
         )
         _manager.add_event_pair(hev, cp_ev)
-    return dpt_ext.reshape(res, res_shape)
+    return dpt.reshape(res, res_shape)
diff --git a/dpctl_ext/tensor/_print.py b/dpctl_ext/tensor/_print.py
new file mode 100644
index 000000000000..5385eadb2537
--- /dev/null
+++ b/dpctl_ext/tensor/_print.py
@@ -0,0 +1,503 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import contextlib
+import itertools
+import operator
+
+import dpctl
+import dpctl.utils
+import numpy as np
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
+import dpctl_ext.tensor._tensor_impl as ti
+
+__doc__ = "Print functions for :class:`dpctl.tensor.usm_ndarray`."
+
+_print_options = {
+    "linewidth": 75,
+    "edgeitems": 3,
+    "threshold": 1000,
+    "precision": 8,
+    "floatmode": "maxprec",
+    "suppress": False,
+    "nanstr": "nan",
+    "infstr": "inf",
+    "sign": "-",
+}
+
+
+def _move_to_next_line(string, s, line_width, prefix):
+    """Move string to next line if it doesn't fit in the current line."""
+    bottom_len = len(s) - (s.rfind("\n") + 1)
+    next_line = bottom_len + len(string) + 1 > line_width
+    string = ",\n" + " " * len(prefix) + string if next_line else ", " + string
+
+    return string
+
+
+def _options_dict(
+    linewidth=None,
+    edgeitems=None,
+    threshold=None,
+    precision=None,
+    floatmode=None,
+    suppress=None,
+    nanstr=None,
+    infstr=None,
+    sign=None,
+    numpy=False,
+):
+    if numpy:
+        numpy_options = np.get_printoptions()
+        options = {k: numpy_options[k] for k in _print_options.keys()}
+    else:
+        options = _print_options.copy()
+
+    if suppress:
+        options["suppress"] = True
+
+    local = dict(locals().items())
+    for int_arg in ["linewidth", "precision", "threshold", "edgeitems"]:
+        val = local[int_arg]
+        if val is not None:
+            options[int_arg] = operator.index(val)
+
+    for str_arg in ["nanstr", "infstr"]:
+        val = local[str_arg]
+        if val is not None:
+            if not isinstance(val, str):
+                raise TypeError(
+                    "`{}` ".format(str_arg) + "must be of `string` type."
+                )
+            options[str_arg] = val
+
+    signs = ["-", "+", " "]
+    if sign is not None:
+        if sign not in signs:
+            raise ValueError(
+                "`sign` must be one of"
+                + ", ".join("`{}`".format(s) for s in signs)
+            )
+        options["sign"] = sign
+
+    floatmodes = ["fixed", "unique", "maxprec", "maxprec_equal"]
+    if floatmode is not None:
+        if floatmode not in floatmodes:
+            raise ValueError(
+                "`floatmode` must be one of"
+                + ", ".join("`{}`".format(m) for m in floatmodes)
+            )
+        options["floatmode"] = floatmode
+
+    return options
+
+
+def set_print_options(
+    linewidth=None,
+    edgeitems=None,
+    threshold=None,
+    precision=None,
+    floatmode=None,
+    suppress=None,
+    nanstr=None,
+    infstr=None,
+    sign=None,
+    numpy=False,
+):
+    """
+    set_print_options(linewidth=None, edgeitems=None, threshold=None,
+                      precision=None, floatmode=None, suppress=None,
+                      nanstr=None, infstr=None, sign=None, numpy=False)
+
+    Set options for printing :class:`dpctl.tensor.usm_ndarray` class.
+
+    Args:
+        linewidth (int, optional):
+            Number of characters printed per line.
+            Raises `TypeError` if linewidth is not an integer.
+            Default: `75`.
+        edgeitems (int, optional):
+            Number of elements at the beginning and end
+            when the printed array is abbreviated.
+            Raises `TypeError` if edgeitems is not an integer.
+            Default: `3`.
+        threshold (int, optional):
+            Number of elements that triggers array abbreviation.
+            Raises `TypeError` if threshold is not an integer.
+            Default: `1000`.
+        precision (int or None, optional):
+            Number of digits printed for floating point numbers.
+            Raises `TypeError` if precision is not an integer.
+            Default: `8`.
+        floatmode (str, optional):
+            Controls how floating point numbers are interpreted.
+                `"fixed:`:
+                    Always prints exactly `precision` digits.
+                `"unique"`:
+                    Ignores precision, prints the number of
+                    digits necessary to uniquely specify each number.
+                `"maxprec"`:
+                    Prints `precision` digits or fewer,
+                    if fewer will uniquely represent a number.
+                `"maxprec_equal"`:
+                    Prints an equal number of digits
+                    for each number. This number is `precision` digits
+                    or fewer, if fewer will uniquely represent each number.
+            Raises `ValueError` if floatmode is not one of
+            `fixed`, `unique`, `maxprec`, or `maxprec_equal`.
+            Default: "maxprec_equal"
+        suppress (bool, optional):
+            If `True,` numbers equal to zero in the current precision
+            will print as zero.
+            Default: `False`.
+        nanstr (str, optional):
+            String used to represent nan.
+            Raises `TypeError` if nanstr is not a string.
+            Default: `"nan"`.
+        infstr (str, optional):
+            String used to represent infinity.
+            Raises `TypeError` if infstr is not a string.
+            Default: `"inf"`.
+        sign (str, optional):
+            Controls the sign of floating point numbers.
+                `"-"`:
+                    Omit the sign of positive numbers.
+                `"+"`:
+                    Always print the sign of positive numbers.
+                `" "`:
+                    Always print a whitespace in place of the
+                    sign of positive numbers.
+            Raises `ValueError` if sign is not one of
+            `"-"`, `"+"`, or `" "`.
+            Default: `"-"`.
+        numpy (bool, optional): If `True,` then before other specified print
+            options are set, a dictionary of Numpy's print options
+            will be used to initialize dpctl's print options.
+            Default: "False"
+    """
+    options = _options_dict(
+        linewidth=linewidth,
+        edgeitems=edgeitems,
+        threshold=threshold,
+        precision=precision,
+        floatmode=floatmode,
+        suppress=suppress,
+        nanstr=nanstr,
+        infstr=infstr,
+        sign=sign,
+        numpy=numpy,
+    )
+    _print_options.update(options)
+
+
+def get_print_options():
+    """get_print_options()
+
+    Returns a copy of current options for printing
+    :class:`dpctl.tensor.usm_ndarray` class.
+
+    Returns:
+        dict: dictionary with array
+           printing option settings.
+
+    Options:
+        - "linewidth" : int, default 75
+        - "edgeitems" : int, default 3
+        - "threshold" : int, default 1000
+        - "precision" : int, default 8
+        - "floatmode" : str, default "maxprec_equal"
+        - "suppress" : bool, default False
+        - "nanstr" : str, default "nan"
+        - "infstr" : str, default "inf"
+        - "sign" : str, default "-"
+    """
+    return _print_options.copy()
+
+
+@contextlib.contextmanager
+def print_options(*args, **kwargs):
+    """
+    Context manager for print options.
+
+    Set print options for the scope of a `with` block.
+    `as` yields dictionary of print options.
+    """
+    options = dpt.get_print_options()
+    try:
+        dpt.set_print_options(*args, **kwargs)
+        yield dpt.get_print_options()
+    finally:
+        dpt.set_print_options(**options)
+
+
+def _nd_corners(arr_in, edge_items):
+    _shape = arr_in.shape
+    max_shape = 2 * edge_items + 1
+    if max(_shape) <= max_shape:
+        return dpt.asnumpy(arr_in)
+    res_shape = tuple(
+        max_shape if _shape[i] > max_shape else _shape[i]
+        for i in range(arr_in.ndim)
+    )
+
+    exec_q = arr_in.sycl_queue
+    arr_out = dpt.empty(
+        res_shape,
+        dtype=arr_in.dtype,
+        usm_type=arr_in.usm_type,
+        sycl_queue=exec_q,
+    )
+
+    blocks = []
+    for i in range(len(_shape)):
+        if _shape[i] > max_shape:
+            blocks.append(
+                (
+                    np.s_[:edge_items],
+                    np.s_[-edge_items:],
+                )
+            )
+        else:
+            blocks.append((np.s_[:],))
+
+    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    hev_list = []
+    for slc in itertools.product(*blocks):
+        hev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arr_in[slc],
+            dst=arr_out[slc],
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        hev_list.append(hev)
+
+    dpctl.SyclEvent.wait_for(hev_list)
+    return dpt.asnumpy(arr_out)
+
+
+def usm_ndarray_str(
+    x,
+    line_width=None,
+    edge_items=None,
+    threshold=None,
+    precision=None,
+    floatmode=None,
+    suppress=None,
+    sign=None,
+    numpy=False,
+    separator=" ",
+    prefix="",
+    suffix="",
+):
+    """
+    usm_ndarray_str(x, line_width=None, edgeitems=None, threshold=None,
+                    precision=None, floatmode=None, suppress=None,
+                    sign=None, numpy=False, separator=" ", prefix="",
+                    suffix="")
+
+    Returns a string representing the elements of a
+    :class:`dpctl.tensor.usm_ndarray`.
+
+    Args:
+        x (usm_ndarray):
+            Input array.
+        line_width (int, optional):
+            Number of characters printed per line.
+            Raises `TypeError` if line_width is not an integer.
+            Default: `75`.
+        edgeitems (int, optional):
+            Number of elements at the beginning and end
+            when the printed array is abbreviated.
+            Raises `TypeError` if edgeitems is not an integer.
+            Default: `3`.
+        threshold (int, optional):
+            Number of elements that triggers array abbreviation.
+            Raises `TypeError` if threshold is not an integer.
+            Default: `1000`.
+        precision (int or None, optional):
+            Number of digits printed for floating point numbers.
+            Raises `TypeError` if precision is not an integer.
+            Default: `8`.
+        floatmode (str, optional):
+            Controls how floating point numbers are interpreted.
+                `"fixed:`:
+                    Always prints exactly `precision` digits.
+                `"unique"`:
+                    Ignores precision, prints the number of
+                    digits necessary to uniquely specify each number.
+                `"maxprec"`:
+                    Prints `precision` digits or fewer,
+                    if fewer will uniquely represent a number.
+                `"maxprec_equal"`:
+                    Prints an equal number of digits for each number.
+                    This number is `precision` digits or fewer,
+                    if fewer will uniquely represent each number.
+            Raises `ValueError` if floatmode is not one of
+            `fixed`, `unique`, `maxprec`, or `maxprec_equal`.
+            Default: "maxprec_equal"
+        suppress (bool, optional):
+            If `True,` numbers equal to zero in the current precision
+            will print as zero.
+            Default: `False`.
+        sign (str, optional):
+            Controls the sign of floating point numbers.
+                `"-"`:
+                    Omit the sign of positive numbers.
+                `"+"`:
+                    Always print the sign of positive numbers.
+                `" "`:
+                    Always print a whitespace in place of the
+                    sign of positive numbers.
+            Raises `ValueError` if sign is not one of
+            `"-"`, `"+"`, or `" "`.
+            Default: `"-"`.
+        numpy (bool, optional):
+            If `True,` then before other specified print
+            options are set, a dictionary of Numpy's print options
+            will be used to initialize dpctl's print options.
+            Default: "False"
+        separator (str, optional):
+            String inserted between elements of the array string.
+            Default: " "
+        prefix (str, optional):
+            String used to determine spacing to the left of the array string.
+            Default: ""
+        suffix (str, optional):
+            String that determines length of the last line of the array string.
+            Default: ""
+
+    Returns:
+        str: string representation of input array.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    options = get_print_options()
+    options.update(
+        _options_dict(
+            linewidth=line_width,
+            edgeitems=edge_items,
+            threshold=threshold,
+            precision=precision,
+            floatmode=floatmode,
+            suppress=suppress,
+            sign=sign,
+            numpy=numpy,
+        )
+    )
+
+    threshold = options["threshold"]
+    edge_items = options["edgeitems"]
+
+    if x.size > threshold:
+        data = _nd_corners(x, edge_items)
+        options["threshold"] = 0
+    else:
+        data = dpt.asnumpy(x)
+    with np.printoptions(**options):
+        s = np.array2string(
+            data, separator=separator, prefix=prefix, suffix=suffix
+        )
+    return s
+
+
+def usm_ndarray_repr(
+    x, line_width=None, precision=None, suppress=None, prefix="usm_ndarray"
+):
+    """
+    usm_ndarray_repr(x, line_width=None, precision=None,
+                     suppress=None, prefix="")
+
+    Returns a formatted string representing the elements
+    of a :class:`dpctl.tensor.usm_ndarray` and its data type,
+    if not a default type.
+
+    Args:
+        x (usm_ndarray): Input array.
+        line_width (int, optional): Number of characters printed per line.
+            Raises `TypeError` if line_width is not an integer.
+            Default: `75`.
+        precision (int or None, optional): Number of digits printed for
+            floating point numbers.
+            Raises `TypeError` if precision is not an integer.
+            Default: `8`.
+        suppress (bool, optional): If `True,` numbers equal to zero
+            in the current precision will print as zero.
+            Default: `False`.
+        prefix (str, optional): String inserted at the start of the array
+            string.
+            Default: ""
+
+    Returns:
+        str: formatted string representing the input array
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    if line_width is None:
+        line_width = _print_options["linewidth"]
+
+    show_dtype = x.dtype not in [
+        dpt.bool,
+        dpt.int64,
+        dpt.float64,
+        dpt.complex128,
+    ]
+
+    prefix = prefix + "("
+    suffix = ")"
+
+    s = usm_ndarray_str(
+        x,
+        line_width=line_width,
+        precision=precision,
+        suppress=suppress,
+        separator=", ",
+        prefix=prefix,
+        suffix=suffix,
+    )
+
+    if show_dtype or x.size == 0:
+        dtype_str = f"dtype={x.dtype.name}"
+        dtype_str = _move_to_next_line(dtype_str, s, line_width, prefix)
+    else:
+        dtype_str = ""
+
+    options = get_print_options()
+    threshold = options["threshold"]
+    if (x.size == 0 and x.shape != (0,)) or x.size > threshold:
+        shape_str = f"shape={x.shape}"
+        shape_str = _move_to_next_line(shape_str, s, line_width, prefix)
+    else:
+        shape_str = ""
+
+    return prefix + s + shape_str + dtype_str + suffix
diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py
index 2daf07b81d85..79e620605f07 100644
--- a/dpctl_ext/tensor/_reduction.py
+++ b/dpctl_ext/tensor/_reduction.py
@@ -27,12 +27,11 @@
 # *****************************************************************************
 
 import dpctl
-import dpctl.tensor as dpt
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 import dpctl_ext.tensor._tensor_reductions_impl as tri
 
@@ -58,7 +57,7 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn):
             axis = (axis,)
         axis = normalize_axis_tuple(axis, nd, "axis")
         perm = [i for i in range(nd) if i not in axis] + list(axis)
-        x_tmp = dpt_ext.permute_dims(x, perm)
+        x_tmp = dpt.permute_dims(x, perm)
     red_nd = len(axis)
     if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]):
         raise ValueError("reduction cannot be performed over zero-size axes")
@@ -96,12 +95,12 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn):
                 "Input and output allocation queues are not compatible"
             )
         if keepdims:
-            out = dpt_ext.squeeze(out, axis=axis)
+            out = dpt.squeeze(out, axis=axis)
             orig_out = out
         if ti._array_overlap(x, out):
-            out = dpt_ext.empty_like(out)
+            out = dpt.empty_like(out)
     else:
-        out = dpt_ext.empty(
+        out = dpt.empty(
             res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q
         )
 
@@ -138,7 +137,7 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn):
     if keepdims:
         res_shape = res_shape + (1,) * red_nd
         inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm)
+        out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm)
     return out
 
 
@@ -164,7 +163,7 @@ def _reduction_over_axis(
             axis = (axis,)
         axis = normalize_axis_tuple(axis, nd, "axis")
         perm = [i for i in range(nd) if i not in axis] + list(axis)
-        arr = dpt_ext.permute_dims(x, perm)
+        arr = dpt.permute_dims(x, perm)
     red_nd = len(axis)
     res_shape = arr.shape[: nd - red_nd]
     q = x.sycl_queue
@@ -212,12 +211,12 @@ def _reduction_over_axis(
                 "Input and output allocation queues are not compatible"
             )
         if keepdims:
-            out = dpt_ext.squeeze(out, axis=axis)
+            out = dpt.squeeze(out, axis=axis)
             orig_out = out
         if ti._array_overlap(x, out) and implemented_types:
-            out = dpt_ext.empty_like(out)
+            out = dpt.empty_like(out)
     else:
-        out = dpt_ext.empty(
+        out = dpt.empty(
             res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
         )
 
@@ -253,7 +252,7 @@ def _reduction_over_axis(
             out = orig_out
     else:
         if _dtype_supported(res_dt, res_dt, res_usm_type, q):
-            tmp = dpt_ext.empty(
+            tmp = dpt.empty(
                 arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
             )
             ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -270,14 +269,14 @@ def _reduction_over_axis(
             _manager.add_event_pair(ht_e_red, red_ev)
         else:
             buf_dt = _default_reduction_type_fn(inp_dt, q)
-            tmp = dpt_ext.empty(
+            tmp = dpt.empty(
                 arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
             )
             ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
                 src=arr, dst=tmp, sycl_queue=q, depends=dep_evs
             )
             _manager.add_event_pair(ht_e_cpy, cpy_e)
-            tmp_res = dpt_ext.empty(
+            tmp_res = dpt.empty(
                 res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
             )
             ht_e_red, r_e = _reduction_fn(
@@ -296,7 +295,7 @@ def _reduction_over_axis(
     if keepdims:
         res_shape = res_shape + (1,) * red_nd
         inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm)
+        out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm)
     return out
 
 
@@ -320,7 +319,7 @@ def _search_over_axis(x, axis, keepdims, out, _reduction_fn):
             )
         axis = normalize_axis_tuple(axis, nd, "axis")
         perm = [i for i in range(nd) if i not in axis] + list(axis)
-        x_tmp = dpt_ext.permute_dims(x, perm)
+        x_tmp = dpt.permute_dims(x, perm)
     axis = normalize_axis_tuple(axis, nd, "axis")
     red_nd = len(axis)
     if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]):
@@ -359,12 +358,12 @@ def _search_over_axis(x, axis, keepdims, out, _reduction_fn):
                 "Input and output allocation queues are not compatible"
             )
         if keepdims:
-            out = dpt_ext.squeeze(out, axis=axis)
+            out = dpt.squeeze(out, axis=axis)
             orig_out = out
         if ti._array_overlap(x, out) and red_nd > 0:
-            out = dpt_ext.empty_like(out)
+            out = dpt.empty_like(out)
     else:
-        out = dpt_ext.empty(
+        out = dpt.empty(
             res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q
         )
 
@@ -395,7 +394,7 @@ def _search_over_axis(x, axis, keepdims, out, _reduction_fn):
     if keepdims:
         res_shape = res_shape + (1,) * red_nd
         inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm)
+        out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm)
     return out
 
 
@@ -506,7 +505,7 @@ def count_nonzero(x, /, *, axis=None, keepdims=False, out=None):
             type.
     """
     if x.dtype != dpt.bool:
-        x = dpt_ext.astype(x, dpt.bool, copy=False)
+        x = dpt.astype(x, dpt.bool, copy=False)
     return sum(
         x,
         axis=axis,
diff --git a/dpctl_ext/tensor/_reshape.py b/dpctl_ext/tensor/_reshape.py
index 23cf47a83568..7ecdace4fc42 100644
--- a/dpctl_ext/tensor/_reshape.py
+++ b/dpctl_ext/tensor/_reshape.py
@@ -28,13 +28,12 @@
 
 import operator
 
-import dpctl.tensor as dpt
 import dpctl.utils
 import numpy as np
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 
 from ._tensor_impl import (
     _copy_usm_ndarray_for_reshape,
@@ -189,7 +188,7 @@ def reshape(X, /, shape, *, order="C", copy=None):
                 src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs
             )
         else:
-            X_t = dpt_ext.permute_dims(X, range(X.ndim - 1, -1, -1))
+            X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1))
             hev, r_e = _copy_usm_ndarray_for_reshape(
                 src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs
             )
diff --git a/dpctl_ext/tensor/_scalar_utils.py b/dpctl_ext/tensor/_scalar_utils.py
index 3ab92b42ad00..84abdf7b5a52 100644
--- a/dpctl_ext/tensor/_scalar_utils.py
+++ b/dpctl_ext/tensor/_scalar_utils.py
@@ -29,13 +29,11 @@
 import numbers
 
 import dpctl.memory as dpm
-import dpctl.tensor as dpt
 import numpy as np
-from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 
 from ._type_utils import (
     WeakBooleanType,
@@ -44,6 +42,7 @@
     WeakIntegralType,
     _to_device_supported_dtype,
 )
+from ._usmarray import _is_object_with_buffer_protocol as _is_buffer
 
 
 def _get_queue_usm_type(o):
@@ -63,7 +62,7 @@ def _get_dtype(o, dev):
     if isinstance(o, dpt.usm_ndarray):
         return o.dtype
     if hasattr(o, "__sycl_usm_array_interface__"):
-        return dpt_ext.asarray(o).dtype
+        return dpt.asarray(o).dtype
     if _is_buffer(o):
         host_dt = np.array(o).dtype
         dev_dt = _to_device_supported_dtype(host_dt, dev)
diff --git a/dpctl_ext/tensor/_search_functions.py b/dpctl_ext/tensor/_search_functions.py
index 285a02b42bb8..aae185b64e2b 100644
--- a/dpctl_ext/tensor/_search_functions.py
+++ b/dpctl_ext/tensor/_search_functions.py
@@ -27,12 +27,11 @@
 # *****************************************************************************
 
 import dpctl
-import dpctl.tensor as dpt
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 
 from ._copy_utils import _empty_like_orderK, _empty_like_triple_orderK
@@ -111,7 +110,7 @@ def _resolve_two_weak_types(o1_dtype, o2_dtype, dev):
 
 
 def _where_result_type(dt1, dt2, dev):
-    res_dtype = dpt_ext.result_type(dt1, dt2)
+    res_dtype = dpt.result_type(dt1, dt2)
     fp16 = dev.has_aspect_fp16
     fp64 = dev.has_aspect_fp64
 
@@ -291,7 +290,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
         if ti._array_overlap(condition, out) and not ti._same_logical_tensors(
             condition, out
         ):
-            out = dpt_ext.empty_like(out)
+            out = dpt.empty_like(out)
 
         if isinstance(x1, dpt.usm_ndarray):
             if (
@@ -299,7 +298,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
                 and not ti._same_logical_tensors(x1, out)
                 and x1_dtype == out_dtype
             ):
-                out = dpt_ext.empty_like(out)
+                out = dpt.empty_like(out)
 
         if isinstance(x2, dpt.usm_ndarray):
             if (
@@ -307,7 +306,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
                 and not ti._same_logical_tensors(x2, out)
                 and x2_dtype == out_dtype
             ):
-                out = dpt_ext.empty_like(out)
+                out = dpt.empty_like(out)
 
     if order == "A":
         order = (
@@ -323,9 +322,9 @@ def where(condition, x1, x2, /, *, order="K", out=None):
             else "C"
         )
     if not isinstance(x1, dpt.usm_ndarray):
-        x1 = dpt_ext.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q)
+        x1 = dpt.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q)
     if not isinstance(x2, dpt.usm_ndarray):
-        x2 = dpt_ext.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q)
+        x2 = dpt.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q)
 
     if condition.size == 0:
         if out is not None:
@@ -342,7 +341,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
                     exec_q,
                 )
             else:
-                return dpt_ext.empty(
+                return dpt.empty(
                     res_shape,
                     dtype=out_dtype,
                     order=order,
@@ -356,7 +355,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
         if order == "K":
             _x1 = _empty_like_orderK(x1, out_dtype)
         else:
-            _x1 = dpt_ext.empty_like(x1, dtype=out_dtype, order=order)
+            _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order)
         ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs
         )
@@ -367,7 +366,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
         if order == "K":
             _x2 = _empty_like_orderK(x2, out_dtype)
         else:
-            _x2 = dpt_ext.empty_like(x2, dtype=out_dtype, order=order)
+            _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order)
         ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs
         )
@@ -380,7 +379,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
                 condition, x1, x2, out_dtype, res_shape, out_usm_type, exec_q
             )
         else:
-            out = dpt_ext.empty(
+            out = dpt.empty(
                 res_shape,
                 dtype=out_dtype,
                 order=order,
@@ -389,11 +388,11 @@ def where(condition, x1, x2, /, *, order="K", out=None):
             )
 
     if condition_shape != res_shape:
-        condition = dpt_ext.broadcast_to(condition, res_shape)
+        condition = dpt.broadcast_to(condition, res_shape)
     if x1_shape != res_shape:
-        x1 = dpt_ext.broadcast_to(x1, res_shape)
+        x1 = dpt.broadcast_to(x1, res_shape)
     if x2_shape != res_shape:
-        x2 = dpt_ext.broadcast_to(x2, res_shape)
+        x2 = dpt.broadcast_to(x2, res_shape)
 
     dep_evs = _manager.submitted_events
     hev, where_ev = ti._where(
diff --git a/dpctl_ext/tensor/_searchsorted.py b/dpctl_ext/tensor/_searchsorted.py
index 2d4807fb0d0c..4c680a49b07b 100644
--- a/dpctl_ext/tensor/_searchsorted.py
+++ b/dpctl_ext/tensor/_searchsorted.py
@@ -32,10 +32,6 @@
 import dpctl
 import dpctl.utils as du
 
-# TODO: revert to `from ._usmarray import...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl.tensor._usmarray import usm_ndarray
-
 from ._copy_utils import _empty_like_orderK
 from ._ctors import empty
 from ._tensor_impl import _copy_usm_ndarray_into_usm_ndarray as ti_copy
@@ -46,6 +42,10 @@
 from ._tensor_sorting_impl import _searchsorted_left, _searchsorted_right
 from ._type_utils import isdtype, result_type
 
+# TODO: revert to `from ._usmarray import...`
+# when dpnp fully migrates dpctl/tensor
+from ._usmarray import usm_ndarray
+
 
 def searchsorted(
     x1: usm_ndarray,
diff --git a/dpctl_ext/tensor/_set_functions.py b/dpctl_ext/tensor/_set_functions.py
index 2672e082d18e..76840461b5e9 100644
--- a/dpctl_ext/tensor/_set_functions.py
+++ b/dpctl_ext/tensor/_set_functions.py
@@ -28,13 +28,11 @@
 
 from typing import NamedTuple, Optional, Union
 
-import dpctl.tensor as dpt
 import dpctl.utils as du
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
-from dpctl_ext.tensor._tensor_elementwise_impl import _not_equal, _subtract
+import dpctl_ext.tensor as dpt
 
 from ._copy_utils import _empty_like_orderK
 from ._scalar_utils import (
@@ -43,6 +41,7 @@
     _get_shape,
     _validate_dtype,
 )
+from ._tensor_elementwise_impl import _not_equal, _subtract
 from ._tensor_impl import (
     _copy_usm_ndarray_into_usm_ndarray,
     _extract,
@@ -112,10 +111,10 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray:
     if x.ndim == 1:
         fx = x
     else:
-        fx = dpt_ext.reshape(x, (x.size,), order="C")
+        fx = dpt.reshape(x, (x.size,), order="C")
     if fx.size == 0:
         return fx
-    s = dpt_ext.empty_like(fx, order="C")
+    s = dpt.empty_like(fx, order="C")
     _manager = du.SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
     if fx.flags.c_contiguous:
@@ -128,7 +127,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray:
         )
         _manager.add_event_pair(ht_ev, sort_ev)
     else:
-        tmp = dpt_ext.empty_like(fx, order="C")
+        tmp = dpt.empty_like(fx, order="C")
         ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
             src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
         )
@@ -141,7 +140,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray:
             depends=[copy_ev],
         )
         _manager.add_event_pair(ht_ev, sort_ev)
-    unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q)
+    unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q)
     ht_ev, uneq_ev = _not_equal(
         src1=s[:-1],
         src2=s[1:],
@@ -155,14 +154,14 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray:
         fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
     )
     _manager.add_event_pair(ht_ev, one_ev)
-    cumsum = dpt_ext.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q)
+    cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q)
     # synchronizing call
     n_uniques = mask_positions(
         unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev]
     )
     if n_uniques == fx.size:
         return s
-    unique_vals = dpt_ext.empty(
+    unique_vals = dpt.empty(
         n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
     )
     ht_ev, ex_e = _extract(
@@ -206,11 +205,11 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
     if x.ndim == 1:
         fx = x
     else:
-        fx = dpt_ext.reshape(x, (x.size,), order="C")
+        fx = dpt.reshape(x, (x.size,), order="C")
     ind_dt = default_device_index_type(exec_q)
     if fx.size == 0:
-        return UniqueCountsResult(fx, dpt_ext.empty_like(fx, dtype=ind_dt))
-    s = dpt_ext.empty_like(fx, order="C")
+        return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt))
+    s = dpt.empty_like(fx, order="C")
 
     _manager = du.SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
@@ -224,7 +223,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
         )
         _manager.add_event_pair(ht_ev, sort_ev)
     else:
-        tmp = dpt_ext.empty_like(fx, order="C")
+        tmp = dpt.empty_like(fx, order="C")
         ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
             src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
         )
@@ -237,7 +236,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
             depends=[copy_ev],
         )
         _manager.add_event_pair(ht_ev, sort_ev)
-    unique_mask = dpt_ext.empty(s.shape, dtype="?", sycl_queue=exec_q)
+    unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q)
     ht_ev, uneq_ev = _not_equal(
         src1=s[:-1],
         src2=s[1:],
@@ -251,9 +250,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
         fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
     )
     _manager.add_event_pair(ht_ev, one_ev)
-    cumsum = dpt_ext.empty(
-        unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q
-    )
+    cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q)
     # synchronizing call
     n_uniques = mask_positions(
         unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev]
@@ -261,11 +258,11 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
     if n_uniques == fx.size:
         return UniqueCountsResult(
             s,
-            dpt_ext.ones(
+            dpt.ones(
                 n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
             ),
         )
-    unique_vals = dpt_ext.empty(
+    unique_vals = dpt.empty(
         n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q
     )
     # populate unique values
@@ -278,10 +275,10 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
         sycl_queue=exec_q,
     )
     _manager.add_event_pair(ht_ev, ex_e)
-    unique_counts = dpt_ext.empty(
+    unique_counts = dpt.empty(
         n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
     )
-    idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
+    idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
     # writing into new allocation, no dependency
     ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q)
     _manager.add_event_pair(ht_ev, id_ev)
@@ -300,7 +297,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
         x.size, dst=unique_counts[-1], sycl_queue=exec_q
     )
     _manager.add_event_pair(ht_ev, set_ev)
-    _counts = dpt_ext.empty_like(unique_counts[1:])
+    _counts = dpt.empty_like(unique_counts[1:])
     ht_ev, sub_ev = _subtract(
         src1=unique_counts[1:],
         src2=unique_counts[:-1],
@@ -342,11 +339,11 @@ def unique_inverse(x):
     if x.ndim == 1:
         fx = x
     else:
-        fx = dpt_ext.reshape(x, (x.size,), order="C")
-    sorting_ids = dpt_ext.empty_like(fx, dtype=ind_dt, order="C")
-    unsorting_ids = dpt_ext.empty_like(sorting_ids, dtype=ind_dt, order="C")
+        fx = dpt.reshape(x, (x.size,), order="C")
+    sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C")
+    unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C")
     if fx.size == 0:
-        return UniqueInverseResult(fx, dpt_ext.reshape(unsorting_ids, x.shape))
+        return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape))
 
     _manager = du.SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
@@ -360,7 +357,7 @@ def unique_inverse(x):
         )
         _manager.add_event_pair(ht_ev, sort_ev)
     else:
-        tmp = dpt_ext.empty_like(fx, order="C")
+        tmp = dpt.empty_like(fx, order="C")
         ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
             src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
         )
@@ -381,7 +378,7 @@ def unique_inverse(x):
         depends=[sort_ev],
     )
     _manager.add_event_pair(ht_ev, argsort_ev)
-    s = dpt_ext.empty_like(fx)
+    s = dpt.empty_like(fx)
     # s = fx[sorting_ids]
     ht_ev, take_ev = _take(
         src=fx,
@@ -393,7 +390,7 @@ def unique_inverse(x):
         depends=[sort_ev],
     )
     _manager.add_event_pair(ht_ev, take_ev)
-    unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q)
+    unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q)
     ht_ev, uneq_ev = _not_equal(
         src1=s[:-1],
         src2=s[1:],
@@ -407,16 +404,14 @@ def unique_inverse(x):
         fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
     )
     _manager.add_event_pair(ht_ev, one_ev)
-    cumsum = dpt_ext.empty(
-        unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q
-    )
+    cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q)
     # synchronizing call
     n_uniques = mask_positions(
         unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev]
     )
     if n_uniques == fx.size:
-        return UniqueInverseResult(s, dpt_ext.reshape(unsorting_ids, x.shape))
-    unique_vals = dpt_ext.empty(
+        return UniqueInverseResult(s, dpt.reshape(unsorting_ids, x.shape))
+    unique_vals = dpt.empty(
         n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q
     )
     ht_ev, uv_ev = _extract(
@@ -428,10 +423,10 @@ def unique_inverse(x):
         sycl_queue=exec_q,
     )
     _manager.add_event_pair(ht_ev, uv_ev)
-    cum_unique_counts = dpt_ext.empty(
+    cum_unique_counts = dpt.empty(
         n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
     )
-    idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
+    idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
     ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q)
     _manager.add_event_pair(ht_ev, id_ev)
     ht_ev, extr_ev = _extract(
@@ -448,7 +443,7 @@ def unique_inverse(x):
         x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q
     )
     _manager.add_event_pair(ht_ev, set_ev)
-    _counts = dpt_ext.empty_like(cum_unique_counts[1:])
+    _counts = dpt.empty_like(cum_unique_counts[1:])
     ht_ev, sub_ev = _subtract(
         src1=cum_unique_counts[1:],
         src2=cum_unique_counts[:-1],
@@ -458,7 +453,7 @@ def unique_inverse(x):
     )
     _manager.add_event_pair(ht_ev, sub_ev)
 
-    inv = dpt_ext.empty_like(x, dtype=ind_dt, order="C")
+    inv = dpt.empty_like(x, dtype=ind_dt, order="C")
     ht_ev, ssl_ev = _searchsorted_left(
         hay=unique_vals,
         needles=x,
@@ -513,17 +508,17 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
     if x.ndim == 1:
         fx = x
     else:
-        fx = dpt_ext.reshape(x, (x.size,), order="C")
-    sorting_ids = dpt_ext.empty_like(fx, dtype=ind_dt, order="C")
-    unsorting_ids = dpt_ext.empty_like(sorting_ids, dtype=ind_dt, order="C")
+        fx = dpt.reshape(x, (x.size,), order="C")
+    sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C")
+    unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C")
     if fx.size == 0:
         # original array contains no data
         # so it can be safely returned as values
         return UniqueAllResult(
             fx,
             sorting_ids,
-            dpt_ext.reshape(unsorting_ids, x.shape),
-            dpt_ext.empty_like(fx, dtype=ind_dt),
+            dpt.reshape(unsorting_ids, x.shape),
+            dpt.empty_like(fx, dtype=ind_dt),
         )
     _manager = du.SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
@@ -537,7 +532,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
         )
         _manager.add_event_pair(ht_ev, sort_ev)
     else:
-        tmp = dpt_ext.empty_like(fx, order="C")
+        tmp = dpt.empty_like(fx, order="C")
         ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
             src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
         )
@@ -558,7 +553,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
         depends=[sort_ev],
     )
     _manager.add_event_pair(ht_ev, args_ev)
-    s = dpt_ext.empty_like(fx)
+    s = dpt.empty_like(fx)
     # s = fx[sorting_ids]
     ht_ev, take_ev = _take(
         src=fx,
@@ -570,7 +565,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
         depends=[sort_ev],
     )
     _manager.add_event_pair(ht_ev, take_ev)
-    unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q)
+    unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q)
     ht_ev, uneq_ev = _not_equal(
         src1=s[:-1],
         src2=s[1:],
@@ -583,24 +578,22 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
         fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
     )
     _manager.add_event_pair(ht_ev, one_ev)
-    cumsum = dpt_ext.empty(
-        unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q
-    )
+    cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q)
     # synchronizing call
     n_uniques = mask_positions(
         unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev]
     )
     if n_uniques == fx.size:
-        _counts = dpt_ext.ones(
+        _counts = dpt.ones(
             n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
         )
         return UniqueAllResult(
             s,
             sorting_ids,
-            dpt_ext.reshape(unsorting_ids, x.shape),
+            dpt.reshape(unsorting_ids, x.shape),
             _counts,
         )
-    unique_vals = dpt_ext.empty(
+    unique_vals = dpt.empty(
         n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q
     )
     ht_ev, uv_ev = _extract(
@@ -612,10 +605,10 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
         sycl_queue=exec_q,
     )
     _manager.add_event_pair(ht_ev, uv_ev)
-    cum_unique_counts = dpt_ext.empty(
+    cum_unique_counts = dpt.empty(
         n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
     )
-    idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
+    idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
     ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q)
     _manager.add_event_pair(ht_ev, id_ev)
     ht_ev, extr_ev = _extract(
@@ -632,7 +625,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
         x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q
     )
     _manager.add_event_pair(ht_ev, set_ev)
-    _counts = dpt_ext.empty_like(cum_unique_counts[1:])
+    _counts = dpt.empty_like(cum_unique_counts[1:])
     ht_ev, sub_ev = _subtract(
         src1=cum_unique_counts[1:],
         src2=cum_unique_counts[:-1],
@@ -642,7 +635,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
     )
     _manager.add_event_pair(ht_ev, sub_ev)
 
-    inv = dpt_ext.empty_like(x, dtype=ind_dt, order="C")
+    inv = dpt.empty_like(x, dtype=ind_dt, order="C")
     ht_ev, ssl_ev = _searchsorted_left(
         hay=unique_vals,
         needles=x,
@@ -734,26 +727,26 @@ def isin(
     x_sh = _get_shape(x)
     if isinstance(test_elements, dpt.usm_ndarray) and test_elements.size == 0:
         if invert:
-            return dpt_ext.ones(
+            return dpt.ones(
                 x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q
             )
         else:
-            return dpt_ext.zeros(
+            return dpt.zeros(
                 x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q
             )
 
     dt1, dt2 = _resolve_weak_types_all_py_ints(x_dt, test_dt, sycl_dev)
-    dt = _to_device_supported_dtype(dpt_ext.result_type(dt1, dt2), sycl_dev)
+    dt = _to_device_supported_dtype(dpt.result_type(dt1, dt2), sycl_dev)
 
     if not isinstance(x, dpt.usm_ndarray):
-        x_arr = dpt_ext.asarray(
+        x_arr = dpt.asarray(
             x, dtype=dt1, usm_type=res_usm_type, sycl_queue=exec_q
         )
     else:
         x_arr = x
 
     if not isinstance(test_elements, dpt.usm_ndarray):
-        test_arr = dpt_ext.asarray(
+        test_arr = dpt.asarray(
             test_elements, dtype=dt2, usm_type=res_usm_type, sycl_queue=exec_q
         )
     else:
@@ -773,7 +766,7 @@ def isin(
 
     if test_dt != dt:
         # copy into C-contiguous memory, because the array will be flattened
-        test_buf = dpt_ext.empty_like(
+        test_buf = dpt.empty_like(
             test_arr, dtype=dt, order="C", usm_type=res_usm_type
         )
         ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray(
@@ -783,10 +776,10 @@ def isin(
     else:
         test_buf = test_arr
 
-    test_buf = dpt_ext.reshape(test_buf, -1)
-    test_buf = dpt_ext.sort(test_buf)
+    test_buf = dpt.reshape(test_buf, -1)
+    test_buf = dpt.sort(test_buf)
 
-    dst = dpt_ext.empty_like(
+    dst = dpt.empty_like(
         x_buf, dtype=dpt.bool, usm_type=res_usm_type, order="C"
     )
 
diff --git a/dpctl_ext/tensor/_slicing.pxi b/dpctl_ext/tensor/_slicing.pxi
new file mode 100644
index 000000000000..86db56013e23
--- /dev/null
+++ b/dpctl_ext/tensor/_slicing.pxi
@@ -0,0 +1,383 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numbers
+from operator import index
+from cpython.buffer cimport PyObject_CheckBuffer
+from numpy import ndarray
+
+
+cdef bint _is_buffer(object o):
+    return PyObject_CheckBuffer(o)
+
+
+cdef Py_ssize_t _slice_len(
+    Py_ssize_t sl_start,
+    Py_ssize_t sl_stop,
+    Py_ssize_t sl_step
+):
+    """
+    Compute len(range(sl_start, sl_stop, sl_step))
+    """
+    if sl_start == sl_stop:
+        return 0
+    if sl_step > 0:
+        if sl_start > sl_stop:
+            return 0
+        # 1 + argmax k such htat sl_start + sl_step*k < sl_stop
+        return 1 + ((sl_stop - sl_start - 1) // sl_step)
+    else:
+        if sl_start < sl_stop:
+            return 0
+        return 1 + ((sl_stop - sl_start + 1) // sl_step)
+
+
+cdef bint _is_integral(object x) except *:
+    """Gives True if x is an integral slice spec"""
+    if isinstance(x, (ndarray, usm_ndarray)):
+        if x.ndim > 0:
+            return False
+        if x.dtype.kind not in "ui":
+            return False
+        return True
+    if isinstance(x, bool):
+        return False
+    if isinstance(x, int):
+        return True
+    if _is_buffer(x):
+        mbuf = memoryview(x)
+        if mbuf.ndim == 0:
+            f = mbuf.format
+            return f in "bBhHiIlLqQ"
+        else:
+            return False
+    if callable(getattr(x, "__index__", None)):
+        try:
+            index(x)
+        except (TypeError, ValueError):
+            return False
+        return True
+    return False
+
+
+cdef bint _is_boolean(object x) except *:
+    """Gives True if x is an integral slice spec"""
+    if isinstance(x, (ndarray, usm_ndarray)):
+        if x.ndim > 0:
+            return False
+        if x.dtype.kind not in "b":
+            return False
+        return True
+    if isinstance(x, bool):
+        return True
+    if isinstance(x, (int, float, complex)):
+        return False
+    if _is_buffer(x):
+        mbuf = memoryview(x)
+        if mbuf.ndim == 0:
+            f = mbuf.format
+            return f in "?"
+        else:
+            return False
+    if callable(getattr(x, "__bool__", None)):
+        try:
+            x.__bool__()
+        except (TypeError, ValueError):
+            return False
+        return True
+    return False
+
+
+def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int):
+    """
+    Give basic slicing index `ind` and array layout information produce
+    a 5-tuple (resulting_shape, resulting_strides, resulting_offset,
+       advanced_ind, resulting_advanced_ind_pos)
+    used to construct a view into underlying array over which advanced
+    indexing, if any, is to be performed.
+
+    Raises IndexError for invalid index `ind`.
+    """
+    _no_advanced_ind = tuple()
+    _no_advanced_pos = -1
+    if ind is Ellipsis:
+        return (shape, strides, offset, _no_advanced_ind, _no_advanced_pos)
+    elif ind is None:
+        return (
+            (1,) + shape,
+            (0,) + strides,
+            offset,
+            _no_advanced_ind,
+            _no_advanced_pos,
+        )
+    elif isinstance(ind, slice):
+        sl_start, sl_stop, sl_step = ind.indices(shape[0])
+        sh0 = _slice_len(sl_start, sl_stop, sl_step)
+        str0 = sl_step * strides[0]
+        new_strides = (
+            strides if (sl_step == 1 or sh0 == 0) else (str0,) + strides[1:]
+        )
+        new_shape = (sh0, ) + shape[1:]
+        is_empty = any(sh_i == 0 for sh_i in new_shape)
+        new_offset = offset if is_empty else offset + sl_start * strides[0]
+        return (
+            new_shape,
+            new_strides,
+            new_offset,
+            _no_advanced_ind,
+            _no_advanced_pos,
+        )
+    elif _is_boolean(ind):
+        if ind:
+            return (
+                (1,) + shape,
+                (0,) + strides,
+                offset,
+                _no_advanced_ind,
+                _no_advanced_pos,
+            )
+        else:
+            return (
+                (0,) + shape,
+                (0,) + strides,
+                offset,
+                _no_advanced_ind,
+                _no_advanced_pos,
+            )
+    elif _is_integral(ind):
+        ind = index(ind)
+        new_shape = shape[1:]
+        new_strides = strides[1:]
+        is_empty = any(sh_i == 0 for sh_i in new_shape)
+        if 0 <= ind < shape[0]:
+            new_offset = offset if is_empty else offset + ind * strides[0]
+            return (
+                new_shape,
+                new_strides,
+                new_offset,
+                _no_advanced_ind,
+                _no_advanced_pos,
+            )
+        elif -shape[0] <= ind < 0:
+            new_offset = (
+                offset if is_empty else offset + (shape[0] + ind) * strides[0]
+            )
+            return (
+                new_shape,
+                new_strides,
+                new_offset,
+                _no_advanced_ind,
+                _no_advanced_pos,
+            )
+        else:
+            raise IndexError(
+                "Index {0} is out of range for axes 0 with "
+                "size {1}".format(ind, shape[0]))
+    elif isinstance(ind, (ndarray, usm_ndarray)):
+        return (shape, strides, offset, (ind,), 0)
+    elif isinstance(ind, tuple):
+        axes_referenced = 0
+        ellipses_count = 0
+        newaxis_count = 0
+        explicit_index = 0
+        seen_arrays_yet = False
+        array_streak_started = False
+        array_streak_interrupted = False
+        for i in ind:
+            if i is None:
+                newaxis_count += 1
+                if array_streak_started:
+                    array_streak_interrupted = True
+            elif i is Ellipsis:
+                ellipses_count += 1
+                if array_streak_started:
+                    array_streak_interrupted = True
+            elif isinstance(i, slice):
+                axes_referenced += 1
+                if array_streak_started:
+                    array_streak_interrupted = True
+            elif _is_boolean(i):
+                newaxis_count += 1
+                if array_streak_started:
+                    array_streak_interrupted = True
+            elif _is_integral(i):
+                axes_referenced += 1
+                if not array_streak_started and array_streak_interrupted:
+                    explicit_index += 1
+            elif isinstance(i, (ndarray, usm_ndarray)):
+                if not seen_arrays_yet:
+                    seen_arrays_yet = True
+                    array_streak_started = True
+                    array_streak_interrupted = False
+                if array_streak_interrupted:
+                    raise IndexError(
+                        "Advanced indexing array specs may not be "
+                        "separated by basic slicing specs."
+                    )
+                dt_k = i.dtype.kind
+                if dt_k == "b" and i.ndim > 0:
+                    axes_referenced += i.ndim
+                elif dt_k in "ui" and i.ndim > 0:
+                    axes_referenced += 1
+                else:
+                    raise IndexError(
+                        "arrays used as indices must be of integer "
+                        "(or boolean) type"
+                    )
+            else:
+                raise IndexError(
+                    "Only integers, slices (`:`), ellipsis (`...`), "
+                    "dpctl.tensor.newaxis (`None`) and integer and "
+                    "boolean arrays are valid indices."
+                )
+        if ellipses_count > 1:
+            raise IndexError(
+                "an index can only have a single ellipsis ('...')")
+        if axes_referenced > len(shape):
+            raise IndexError(
+                "too many indices for an array, array is "
+                "{0}-dimensional, but {1} were indexed".format(
+                    len(shape), axes_referenced))
+        if ellipses_count:
+            ellipses_count = len(shape) - axes_referenced
+        new_shape_len = (newaxis_count + ellipses_count
+                         + axes_referenced - explicit_index)
+        new_shape = list()
+        new_strides = list()
+        new_advanced_ind = list()
+        k = 0
+        new_advanced_start_pos = -1
+        advanced_start_pos_set = False
+        new_offset = offset
+        is_empty = False
+        array_streak = False
+        for i in range(len(ind)):
+            ind_i = ind[i]
+            if (ind_i is Ellipsis):
+                k_new = k + ellipses_count
+                new_shape.extend(shape[k:k_new])
+                new_strides.extend(strides[k:k_new])
+                if any(dim == 0 for dim in shape[k:k_new]):
+                    is_empty = True
+                    new_offset = offset
+                k = k_new
+                if array_streak:
+                    array_streak = False
+            elif ind_i is None:
+                new_shape.append(1)
+                new_strides.append(0)
+                if array_streak:
+                    array_streak = False
+            elif isinstance(ind_i, slice):
+                k_new = k + 1
+                sl_start, sl_stop, sl_step = ind_i.indices(shape[k])
+                sh_i = _slice_len(sl_start, sl_stop, sl_step)
+                str_i = (1 if sh_i == 0 else sl_step) * strides[k]
+                new_shape.append(sh_i)
+                new_strides.append(str_i)
+                if sh_i > 0 and not is_empty:
+                    new_offset = new_offset + sl_start * strides[k]
+                if sh_i == 0:
+                    is_empty = True
+                    new_offset = offset
+                k = k_new
+                if array_streak:
+                    array_streak = False
+            elif _is_boolean(ind_i):
+                new_shape.append(1 if ind_i else 0)
+                new_strides.append(0)
+                if array_streak:
+                    array_streak = False
+            elif _is_integral(ind_i):
+                if array_streak:
+                    if not isinstance(ind_i, (ndarray, usm_ndarray)):
+                        ind_i = index(ind_i)
+                        # integer will be converted to an array,
+                        # still raise if OOB
+                        if not (
+                            0 <= ind_i < shape[k] or -shape[k] <= ind_i < 0
+                        ):
+                            raise IndexError(
+                                "Index {0} is out of range for axes "
+                                "{1} with size {2}".format(ind_i, k, shape[k])
+                            )
+                    new_advanced_ind.append(ind_i)
+                    k_new = k + 1
+                    new_shape.extend(shape[k:k_new])
+                    new_strides.extend(strides[k:k_new])
+                    k = k_new
+                else:
+                    ind_i = index(ind_i)
+                    if 0 <= ind_i < shape[k]:
+                        k_new = k + 1
+                        if not is_empty:
+                            new_offset = new_offset + ind_i * strides[k]
+                        k = k_new
+                    elif -shape[k] <= ind_i < 0:
+                        k_new = k + 1
+                        if not is_empty:
+                            new_offset = (
+                                new_offset + (shape[k] + ind_i) * strides[k]
+                            )
+                        k = k_new
+                    else:
+                        raise IndexError(
+                            "Index {0} is out of range for axes "
+                            "{1} with size {2}".format(ind_i, k, shape[k])
+                        )
+            elif isinstance(ind_i, (ndarray, usm_ndarray)):
+                if not array_streak:
+                    array_streak = True
+                if not advanced_start_pos_set:
+                    new_advanced_start_pos = len(new_shape)
+                    advanced_start_pos_set = True
+                new_advanced_ind.append(ind_i)
+                dt_k = ind_i.dtype.kind
+                if dt_k == "b":
+                    k_new = k + ind_i.ndim
+                else:
+                    k_new = k + 1
+                new_shape.extend(shape[k:k_new])
+                new_strides.extend(strides[k:k_new])
+                k = k_new
+        new_shape.extend(shape[k:])
+        new_strides.extend(strides[k:])
+        new_shape_len += len(shape) - k
+        return (
+            tuple(new_shape),
+            tuple(new_strides),
+            new_offset,
+            tuple(new_advanced_ind),
+            new_advanced_start_pos
+        )
+    else:
+        raise IndexError(
+            "Only integers, slices (`:`), ellipsis (`...`), "
+            "dpctl.tensor.newaxis (`None`) and integer and "
+            "boolean arrays are valid indices."
+        )
diff --git a/dpctl_ext/tensor/_sorting.py b/dpctl_ext/tensor/_sorting.py
index 24693a408889..42cd9e1b44be 100644
--- a/dpctl_ext/tensor/_sorting.py
+++ b/dpctl_ext/tensor/_sorting.py
@@ -29,12 +29,11 @@
 import operator
 from typing import NamedTuple
 
-import dpctl.tensor as dpt
 import dpctl.utils as du
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 
 from ._numpy_helper import normalize_axis_index
@@ -98,7 +97,7 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
     nd = x.ndim
     if nd == 0:
         axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis")
-        return dpt_ext.copy(x, order="C")
+        return dpt.copy(x, order="C")
     else:
         axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis")
     a1 = axis + 1
@@ -109,7 +108,7 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
         perm = [i for i in range(nd) if i != axis] + [
             axis,
         ]
-        arr = dpt_ext.permute_dims(x, perm)
+        arr = dpt.permute_dims(x, perm)
     if kind is None:
         kind = "stable"
     if not isinstance(kind, str) or kind not in [
@@ -138,7 +137,7 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
     _manager = du.SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
     if arr.flags.c_contiguous:
-        res = dpt_ext.empty_like(arr, order="C")
+        res = dpt.empty_like(arr, order="C")
         ht_ev, impl_ev = impl_fn(
             src=arr,
             trailing_dims_to_sort=1,
@@ -148,12 +147,12 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
         )
         _manager.add_event_pair(ht_ev, impl_ev)
     else:
-        tmp = dpt_ext.empty_like(arr, order="C")
+        tmp = dpt.empty_like(arr, order="C")
         ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs
         )
         _manager.add_event_pair(ht_ev, copy_ev)
-        res = dpt_ext.empty_like(arr, order="C")
+        res = dpt.empty_like(arr, order="C")
         ht_ev, impl_ev = impl_fn(
             src=tmp,
             trailing_dims_to_sort=1,
@@ -164,7 +163,7 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
         _manager.add_event_pair(ht_ev, impl_ev)
     if a1 != nd:
         inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        res = dpt_ext.permute_dims(res, inv_perm)
+        res = dpt.permute_dims(res, inv_perm)
     return res
 
 
@@ -214,7 +213,7 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None):
     nd = x.ndim
     if nd == 0:
         axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis")
-        return dpt_ext.zeros_like(
+        return dpt.zeros_like(
             x, dtype=ti.default_device_index_type(x.sycl_queue), order="C"
         )
     else:
@@ -227,7 +226,7 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None):
         perm = [i for i in range(nd) if i != axis] + [
             axis,
         ]
-        arr = dpt_ext.permute_dims(x, perm)
+        arr = dpt.permute_dims(x, perm)
     if kind is None:
         kind = "stable"
     if not isinstance(kind, str) or kind not in [
@@ -257,7 +256,7 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None):
     dep_evs = _manager.submitted_events
     index_dt = ti.default_device_index_type(exec_q)
     if arr.flags.c_contiguous:
-        res = dpt_ext.empty_like(arr, dtype=index_dt, order="C")
+        res = dpt.empty_like(arr, dtype=index_dt, order="C")
         ht_ev, impl_ev = impl_fn(
             src=arr,
             trailing_dims_to_sort=1,
@@ -267,12 +266,12 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None):
         )
         _manager.add_event_pair(ht_ev, impl_ev)
     else:
-        tmp = dpt_ext.empty_like(arr, order="C")
+        tmp = dpt.empty_like(arr, order="C")
         ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs
         )
         _manager.add_event_pair(ht_ev, copy_ev)
-        res = dpt_ext.empty_like(arr, dtype=index_dt, order="C")
+        res = dpt.empty_like(arr, dtype=index_dt, order="C")
         ht_ev, impl_ev = impl_fn(
             src=tmp,
             trailing_dims_to_sort=1,
@@ -283,7 +282,7 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None):
         _manager.add_event_pair(ht_ev, impl_ev)
     if a1 != nd:
         inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        res = dpt_ext.permute_dims(res, inv_perm)
+        res = dpt.permute_dims(res, inv_perm)
     return res
 
 
@@ -354,8 +353,8 @@ def top_k(x, k, /, *, axis=None, mode="largest"):
             if k > 1:
                 raise ValueError(f"`k`={k} is out of bounds 1")
             return TopKResult(
-                dpt_ext.copy(x, order="C"),
-                dpt_ext.zeros_like(
+                dpt.copy(x, order="C"),
+                dpt.zeros_like(
                     x, dtype=ti.default_device_index_type(x.sycl_queue)
                 ),
             )
@@ -373,7 +372,7 @@ def top_k(x, k, /, *, axis=None, mode="largest"):
             perm = [i for i in range(nd) if i != axis] + [
                 axis,
             ]
-            arr = dpt_ext.permute_dims(x, perm)
+            arr = dpt.permute_dims(x, perm)
         n_search_dims = 1
         res_sh = arr.shape[: nd - 1] + (k,)
 
@@ -386,14 +385,14 @@ def top_k(x, k, /, *, axis=None, mode="largest"):
 
     res_usm_type = arr.usm_type
     if arr.flags.c_contiguous:
-        vals = dpt_ext.empty(
+        vals = dpt.empty(
             res_sh,
             dtype=arr.dtype,
             usm_type=res_usm_type,
             order="C",
             sycl_queue=exec_q,
         )
-        inds = dpt_ext.empty(
+        inds = dpt.empty(
             res_sh,
             dtype=ti.default_device_index_type(exec_q),
             usm_type=res_usm_type,
@@ -412,19 +411,19 @@ def top_k(x, k, /, *, axis=None, mode="largest"):
         )
         _manager.add_event_pair(ht_ev, impl_ev)
     else:
-        tmp = dpt_ext.empty_like(arr, order="C")
+        tmp = dpt.empty_like(arr, order="C")
         ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs
         )
         _manager.add_event_pair(ht_ev, copy_ev)
-        vals = dpt_ext.empty(
+        vals = dpt.empty(
             res_sh,
             dtype=arr.dtype,
             usm_type=res_usm_type,
             order="C",
             sycl_queue=exec_q,
         )
-        inds = dpt_ext.empty(
+        inds = dpt.empty(
             res_sh,
             dtype=ti.default_device_index_type(exec_q),
             usm_type=res_usm_type,
@@ -444,7 +443,7 @@ def top_k(x, k, /, *, axis=None, mode="largest"):
         _manager.add_event_pair(ht_ev, impl_ev)
     if axis is not None and a1 != nd:
         inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        vals = dpt_ext.permute_dims(vals, inv_perm)
-        inds = dpt_ext.permute_dims(inds, inv_perm)
+        vals = dpt.permute_dims(vals, inv_perm)
+        inds = dpt.permute_dims(inds, inv_perm)
 
     return TopKResult(vals, inds)
diff --git a/dpctl_ext/tensor/_statistical_functions.py b/dpctl_ext/tensor/_statistical_functions.py
index 5513dfa7a65f..c1544b84c6a7 100644
--- a/dpctl_ext/tensor/_statistical_functions.py
+++ b/dpctl_ext/tensor/_statistical_functions.py
@@ -25,12 +25,11 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 # THE POSSIBILITY OF SUCH DAMAGE.
 
-import dpctl.tensor as dpt
 import dpctl.utils as du
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_elementwise_impl as tei
 import dpctl_ext.tensor._tensor_impl as ti
 import dpctl_ext.tensor._tensor_reductions_impl as tri
@@ -66,7 +65,7 @@ def _var_impl(x, axis, correction, keepdims):
     _manager = du.SequentialOrderManager[q]
     dep_evs = _manager.submitted_events
     if inp_dt != res_dt:
-        buf = dpt_ext.empty_like(x, dtype=res_dt)
+        buf = dpt.empty_like(x, dtype=res_dt)
         ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray(
             src=x, dst=buf, sycl_queue=q, depends=dep_evs
         )
@@ -74,18 +73,18 @@ def _var_impl(x, axis, correction, keepdims):
     else:
         buf = x
     # calculate mean
-    buf2 = dpt_ext.permute_dims(buf, perm)
+    buf2 = dpt.permute_dims(buf, perm)
     res_shape = buf2.shape[: nd - red_nd]
     # use keepdims=True path for later broadcasting
     if red_nd == 0:
-        mean_ary = dpt_ext.empty_like(buf)
+        mean_ary = dpt.empty_like(buf)
         dep_evs = _manager.submitted_events
         ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray(
             src=buf, dst=mean_ary, sycl_queue=q, depends=dep_evs
         )
         _manager.add_event_pair(ht_e1, c_e2)
     else:
-        mean_ary = dpt_ext.empty(
+        mean_ary = dpt.empty(
             res_shape,
             dtype=res_dt,
             usm_type=res_usm_type,
@@ -103,8 +102,8 @@ def _var_impl(x, axis, correction, keepdims):
 
         mean_ary_shape = res_shape + (1,) * red_nd
         inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        mean_ary = dpt_ext.permute_dims(
-            dpt_ext.reshape(mean_ary, mean_ary_shape), inv_perm
+        mean_ary = dpt.permute_dims(
+            dpt.reshape(mean_ary, mean_ary_shape), inv_perm
         )
     # divide in-place to get mean
     mean_ary_shape = mean_ary.shape
@@ -116,9 +115,9 @@ def _var_impl(x, axis, correction, keepdims):
     _manager.add_event_pair(ht_e2, d_e1)
 
     # subtract mean from original array to get deviations
-    dev_ary = dpt_ext.empty_like(buf)
+    dev_ary = dpt.empty_like(buf)
     if mean_ary_shape != buf.shape:
-        mean_ary = dpt_ext.broadcast_to(mean_ary, buf.shape)
+        mean_ary = dpt.broadcast_to(mean_ary, buf.shape)
     ht_e4, su_e = tei._subtract(
         src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1]
     )
@@ -130,11 +129,11 @@ def _var_impl(x, axis, correction, keepdims):
     _manager.add_event_pair(ht_e5, sq_e)
 
     # take sum of squared deviations
-    dev_ary2 = dpt_ext.permute_dims(dev_ary, perm)
+    dev_ary2 = dpt.permute_dims(dev_ary, perm)
     if red_nd == 0:
         res = dev_ary
     else:
-        res = dpt_ext.empty(
+        res = dpt.empty(
             res_shape,
             dtype=res_dt,
             usm_type=res_usm_type,
@@ -152,9 +151,7 @@ def _var_impl(x, axis, correction, keepdims):
         if keepdims:
             res_shape = res_shape + (1,) * red_nd
             inv_perm = sorted(range(nd), key=lambda d: perm[d])
-            res = dpt_ext.permute_dims(
-                dpt_ext.reshape(res, res_shape), inv_perm
-            )
+            res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
     res_shape = res.shape
     # when nelems - correction <= 0, yield nans
     div = max(nelems - correction, 0)
@@ -215,7 +212,7 @@ def mean(x, axis=None, keepdims=False):
             nelems *= x.shape[i]
     sum_nd = len(axis)
     perm = perm + list(axis)
-    arr2 = dpt_ext.permute_dims(x, perm)
+    arr2 = dpt.permute_dims(x, perm)
     res_shape = arr2.shape[: nd - sum_nd]
     q = x.sycl_queue
     inp_dt = x.dtype
@@ -226,12 +223,12 @@ def mean(x, axis=None, keepdims=False):
     )
     res_usm_type = x.usm_type
     if sum_nd == 0:
-        return dpt_ext.astype(x, res_dt, copy=True)
+        return dpt.astype(x, res_dt, copy=True)
 
     _manager = du.SequentialOrderManager[q]
     dep_evs = _manager.submitted_events
     if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q):
-        res = dpt_ext.empty(
+        res = dpt.empty(
             res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
         )
         ht_e1, r_e = tri._sum_over_axis(
@@ -243,14 +240,14 @@ def mean(x, axis=None, keepdims=False):
         )
         _manager.add_event_pair(ht_e1, r_e)
     else:
-        tmp = dpt_ext.empty(
+        tmp = dpt.empty(
             arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
         )
         ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
             src=arr2, dst=tmp, sycl_queue=q, depends=dep_evs
         )
         _manager.add_event_pair(ht_e_cpy, cpy_e)
-        res = dpt_ext.empty(
+        res = dpt.empty(
             res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
         )
         ht_e_red, r_e = tri._sum_over_axis(
@@ -265,7 +262,7 @@ def mean(x, axis=None, keepdims=False):
     if keepdims:
         res_shape = res_shape + (1,) * sum_nd
         inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        res = dpt_ext.permute_dims(dpt_ext.reshape(res, res_shape), inv_perm)
+        res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
 
     dep_evs = _manager.submitted_events
     ht_e2, div_e = tei._divide_by_scalar(
diff --git a/dpctl_ext/tensor/_stride_utils.pxi b/dpctl_ext/tensor/_stride_utils.pxi
new file mode 100644
index 000000000000..3caf8dd8fd1f
--- /dev/null
+++ b/dpctl_ext/tensor/_stride_utils.pxi
@@ -0,0 +1,314 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+
+from cpython.mem cimport PyMem_Malloc
+from cpython.ref cimport Py_INCREF
+from cpython.tuple cimport PyTuple_New, PyTuple_SetItem
+
+
+cdef int ERROR_MALLOC = 1
+cdef int ERROR_INTERNAL = -1
+cdef int ERROR_INCORRECT_ORDER = 2
+cdef int ERROR_UNEXPECTED_STRIDES = 3
+
+cdef int USM_ARRAY_C_CONTIGUOUS = 1
+cdef int USM_ARRAY_F_CONTIGUOUS = 2
+cdef int USM_ARRAY_WRITABLE = 4
+
+
+cdef Py_ssize_t shape_to_elem_count(int nd, Py_ssize_t *shape_arr):
+    """
+    Computes number of elements in an array.
+    """
+    cdef Py_ssize_t count = 1
+    for i in range(nd):
+        count *= shape_arr[i]
+    return count
+
+
+cdef int _from_input_shape_strides(
+    int nd, object shape, object strides, int itemsize, char order,
+    Py_ssize_t **shape_ptr, Py_ssize_t **strides_ptr,
+    Py_ssize_t *nelems, Py_ssize_t *min_disp, Py_ssize_t *max_disp,
+    int *contig
+):
+    """
+    Arguments: nd, shape, strides, itemsize, order
+    Modifies:
+        shape_ptr - pointer to C array for shape values
+        stride_ptr - pointer to C array for strides values
+        nelems - Number of elements in array
+        min_disp = min( dot(strides, index), index for shape)
+        max_disp = max( dor(strides, index), index for shape)
+        contig = enumeration for array contiguity
+    Returns: 0 on success, error code otherwise.
+        On success pointers point to allocated arrays,
+        Otherwise they are set to NULL
+    """
+    cdef int i
+    cdef int j
+    cdef bint all_incr = 1
+    cdef bint all_decr = 1
+    cdef bint strides_inspected = 0
+    cdef Py_ssize_t elem_count = 1
+    cdef Py_ssize_t min_shift = 0
+    cdef Py_ssize_t max_shift = 0
+    cdef Py_ssize_t str_i
+    cdef Py_ssize_t* shape_arr
+    cdef Py_ssize_t* strides_arr
+
+    if (int(order) not in [ord("C"), ord("F"), ord("c"), ord("f")]):
+        return ERROR_INCORRECT_ORDER
+
+    # 0-d array
+    if (nd == 0):
+        contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
+        nelems[0] = 1
+        min_disp[0] = 0
+        max_disp[0] = 0
+        shape_ptr[0] = <Py_ssize_t *>(<size_t>0)
+        strides_ptr[0] = <Py_ssize_t *>(<size_t>0)
+        return 0
+
+    shape_arr = <Py_ssize_t*>PyMem_Malloc(nd * sizeof(Py_ssize_t))
+    if (not shape_arr):
+        return ERROR_MALLOC
+    shape_ptr[0] = shape_arr
+    for i in range(0, nd):
+        shape_arr[i] = <Py_ssize_t> shape[i]
+        elem_count *= shape_arr[i]
+    if elem_count == 0:
+        contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
+        nelems[0] = 1
+        min_disp[0] = 0
+        max_disp[0] = 0
+        if strides is None:
+            strides_ptr[0] = <Py_ssize_t *>(<size_t>0)
+        else:
+            strides_arr = <Py_ssize_t*>PyMem_Malloc(nd * sizeof(Py_ssize_t))
+            if (not strides_arr):
+                PyMem_Free(shape_ptr[0])
+                shape_ptr[0] = <Py_ssize_t *>(<size_t>0)
+                return ERROR_MALLOC
+            strides_ptr[0] = strides_arr
+            for i in range(0, nd):
+                strides_arr[i] = <Py_ssize_t> strides[i]
+        return 0
+    nelems[0] = elem_count
+    if (strides is None):
+        # no need to allocate and populate strides
+        if order == <char> ord("C") or order == <char> ord("c"):
+            contig[0] = USM_ARRAY_C_CONTIGUOUS
+        else:
+            contig[0] = USM_ARRAY_F_CONTIGUOUS
+        if nd == 1:
+            contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS
+        else:
+            j = 0
+            for i in range(nd):
+                if shape_arr[i] > 1:
+                    j = j + 1
+            if j < 2:
+                contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS
+        min_disp[0] = 0
+        max_disp[0] = (elem_count - 1)
+        strides_ptr[0] = <Py_ssize_t *>(<size_t>0)
+        return 0
+    elif ((isinstance(strides, (list, tuple)) or hasattr(strides, "tolist"))
+          and len(strides) == nd):
+        strides_arr = <Py_ssize_t*>PyMem_Malloc(nd * sizeof(Py_ssize_t))
+        if (not strides_arr):
+            PyMem_Free(shape_ptr[0])
+            shape_ptr[0] = <Py_ssize_t *>(<size_t>0)
+            return ERROR_MALLOC
+        strides_ptr[0] = strides_arr
+        for i in range(0, nd):
+            str_i = <Py_ssize_t> strides[i]
+            strides_arr[i] = str_i
+            if str_i > 0:
+                max_shift += str_i * (shape_arr[i] - 1)
+            else:
+                min_shift += str_i * (shape_arr[i] - 1)
+        min_disp[0] = min_shift
+        max_disp[0] = max_shift
+        if max_shift == min_shift + (elem_count - 1):
+            if elem_count == 1:
+                contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
+                return 0
+            if nd == 1:
+                if strides_arr[0] == 1:
+                    contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS
+                else:
+                    contig[0] = 0
+                return 0
+            i = 0
+            while i < nd:
+                if shape_arr[i] == 1:
+                    i = i + 1
+                    continue
+                j = i + 1
+                while (j < nd and shape_arr[j] == 1):
+                    j = j + 1
+                if j < nd:
+                    strides_inspected = 1
+                    if all_incr:
+                        all_incr = (
+                            (strides_arr[i] > 0) and
+                            (strides_arr[j] > 0) and
+                            (strides_arr[i] <= strides_arr[j])
+                        )
+                    if all_decr:
+                        all_decr = (
+                            (strides_arr[i] > 0) and
+                            (strides_arr[j] > 0) and
+                            (strides_arr[i] >= strides_arr[j])
+                        )
+                    i = j
+                else:
+                    if not strides_inspected:
+                        # all dimensions have size 1 except
+                        # dimension 'i'. Array is both C and F
+                        # contiguous
+                        strides_inspected = 1
+                        all_incr = (strides_arr[i] == 1)
+                        all_decr = all_incr
+                    break
+            # should only set contig flags on actually obtained
+            # values, rather than default values
+            all_incr = all_incr and strides_inspected
+            all_decr = all_decr and strides_inspected
+            if all_incr and all_decr:
+                contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
+            elif all_incr:
+                contig[0] = USM_ARRAY_F_CONTIGUOUS
+            elif all_decr:
+                contig[0] = USM_ARRAY_C_CONTIGUOUS
+            else:
+                contig[0] = 0
+            return 0
+        else:
+            contig[0] = 0  # non-contiguous
+        return 0
+    else:
+        PyMem_Free(shape_ptr[0])
+        shape_ptr[0] = <Py_ssize_t *>(<size_t>0)
+        return ERROR_UNEXPECTED_STRIDES
+    # return ERROR_INTERNAL
+
+
+cdef object _make_int_tuple(int nd, const Py_ssize_t *ary):
+    """
+    Makes Python tuple from C array
+    """
+    cdef tuple res
+    cdef object tmp
+    if (ary):
+        res = PyTuple_New(nd)
+        for i in range(nd):
+            tmp = <object>ary[i]
+            Py_INCREF(tmp)  # SetItem steals the reference
+            PyTuple_SetItem(res, i, tmp)
+        return res
+    else:
+        return None
+
+
+cdef object _make_reversed_int_tuple(int nd, const Py_ssize_t *ary):
+    """
+    Makes Python reversed tuple from C array
+    """
+    cdef tuple res
+    cdef object tmp
+    cdef int i
+    cdef int nd_1
+    if (ary):
+        res = PyTuple_New(nd)
+        nd_1 = nd - 1
+        for i in range(nd):
+            tmp = <object>ary[i]
+            Py_INCREF(tmp)  # SetItem steals the reference
+            PyTuple_SetItem(res, nd_1 - i, tmp)
+        return res
+    else:
+        return None
+
+
+cdef object _c_contig_strides(int nd, Py_ssize_t *shape):
+    """
+    Makes Python tuple for strides of C-contiguous array
+    """
+    cdef tuple cc_strides = PyTuple_New(nd)
+    cdef object si = 1
+    cdef int i
+    cdef int nd_1 = nd - 1
+    for i in range(0, nd):
+        Py_INCREF(si)  # SetItem steals the reference
+        PyTuple_SetItem(cc_strides, nd_1 - i, si)
+        si = si * shape[nd_1 - i]
+    return cc_strides
+
+
+cdef object _f_contig_strides(int nd, Py_ssize_t *shape):
+    """
+    Makes Python tuple for strides of F-contiguous array
+    """
+    cdef tuple fc_strides = PyTuple_New(nd)
+    cdef object si = 1
+    for i in range(0, nd):
+        Py_INCREF(si)  # SetItem steals the reference
+        PyTuple_SetItem(fc_strides, i, si)
+        si = si * shape[i]
+    return fc_strides
+
+cdef object _swap_last_two(tuple t):
+    """
+    Swap last two elements of a tuple
+    """
+    cdef int nd = len(t)
+    cdef tuple res
+    cdef int i
+    cdef object tmp
+    if (nd < 2):
+        return t
+    res = PyTuple_New(nd)
+    # copy all elements except the last two
+    for i in range(0, nd-2):
+        tmp = t[i]
+        Py_INCREF(tmp)  # SetItem steals the reference
+        PyTuple_SetItem(res, i, tmp)
+    # swap the last two elements
+    tmp = t[nd-1]
+    Py_INCREF(tmp)  # SetItem steals
+    PyTuple_SetItem(res, nd - 2, tmp)
+    tmp = t[nd-2]
+    Py_INCREF(tmp)  # SetItem steals
+    PyTuple_SetItem(res, nd - 1, tmp)
+    return res
diff --git a/dpctl_ext/tensor/_testing.py b/dpctl_ext/tensor/_testing.py
index c0f475212232..4c9f5ebac9a4 100644
--- a/dpctl_ext/tensor/_testing.py
+++ b/dpctl_ext/tensor/_testing.py
@@ -26,100 +26,91 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
 
-import dpctl.tensor as dpt
 import dpctl.utils as du
 import numpy as np
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 
 from ._manipulation_functions import _broadcast_shape_impl
 from ._type_utils import _to_device_supported_dtype
 
 
 def _allclose_complex_fp(z1, z2, atol, rtol, equal_nan):
-    z1r = dpt_ext.real(z1)
-    z1i = dpt_ext.imag(z1)
-    z2r = dpt_ext.real(z2)
-    z2i = dpt_ext.imag(z2)
+    z1r = dpt.real(z1)
+    z1i = dpt.imag(z1)
+    z2r = dpt.real(z2)
+    z2i = dpt.imag(z2)
     if equal_nan:
-        check1 = dpt_ext.all(
-            dpt_ext.isnan(z1r) == dpt_ext.isnan(z2r)
-        ) and dpt_ext.all(dpt_ext.isnan(z1i) == dpt_ext.isnan(z2i))
+        check1 = dpt.all(dpt.isnan(z1r) == dpt.isnan(z2r)) and dpt.all(
+            dpt.isnan(z1i) == dpt.isnan(z2i)
+        )
     else:
         check1 = (
-            dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(z1r)))
-            and dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(z1i)))
+            dpt.logical_not(dpt.any(dpt.isnan(z1r)))
+            and dpt.logical_not(dpt.any(dpt.isnan(z1i)))
         ) and (
-            dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(z2r)))
-            and dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(z2i)))
+            dpt.logical_not(dpt.any(dpt.isnan(z2r)))
+            and dpt.logical_not(dpt.any(dpt.isnan(z2i)))
         )
     if not check1:
         return check1
-    mr = dpt_ext.isinf(z1r)
-    mi = dpt_ext.isinf(z1i)
-    check2 = dpt_ext.all(mr == dpt_ext.isinf(z2r)) and dpt_ext.all(
-        mi == dpt_ext.isinf(z2i)
-    )
+    mr = dpt.isinf(z1r)
+    mi = dpt.isinf(z1i)
+    check2 = dpt.all(mr == dpt.isinf(z2r)) and dpt.all(mi == dpt.isinf(z2i))
     if not check2:
         return check2
-    check3 = dpt_ext.all(z1r[mr] == z2r[mr]) and dpt_ext.all(z1i[mi] == z2i[mi])
+    check3 = dpt.all(z1r[mr] == z2r[mr]) and dpt.all(z1i[mi] == z2i[mi])
     if not check3:
         return check3
-    mr = dpt_ext.isfinite(z1r)
-    mi = dpt_ext.isfinite(z1i)
+    mr = dpt.isfinite(z1r)
+    mi = dpt.isfinite(z1i)
     mv1 = z1r[mr]
     mv2 = z2r[mr]
-    check4 = dpt_ext.all(
-        dpt_ext.abs(mv1 - mv2)
-        < dpt_ext.maximum(
-            atol, rtol * dpt_ext.maximum(dpt_ext.abs(mv1), dpt_ext.abs(mv2))
-        )
+    check4 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        < dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
     )
     if not check4:
         return check4
     mv1 = z1i[mi]
     mv2 = z2i[mi]
-    check5 = dpt_ext.all(
-        dpt_ext.abs(mv1 - mv2)
-        <= dpt_ext.maximum(
-            atol, rtol * dpt_ext.maximum(dpt_ext.abs(mv1), dpt_ext.abs(mv2))
-        )
+    check5 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
     )
     return check5
 
 
 def _allclose_real_fp(r1, r2, atol, rtol, equal_nan):
     if equal_nan:
-        check1 = dpt_ext.all(dpt_ext.isnan(r1) == dpt_ext.isnan(r2))
+        check1 = dpt.all(dpt.isnan(r1) == dpt.isnan(r2))
     else:
-        check1 = dpt_ext.logical_not(
-            dpt_ext.any(dpt_ext.isnan(r1))
-        ) and dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(r2)))
+        check1 = dpt.logical_not(dpt.any(dpt.isnan(r1))) and dpt.logical_not(
+            dpt.any(dpt.isnan(r2))
+        )
     if not check1:
         return check1
-    mr = dpt_ext.isinf(r1)
-    check2 = dpt_ext.all(mr == dpt_ext.isinf(r2))
+    mr = dpt.isinf(r1)
+    check2 = dpt.all(mr == dpt.isinf(r2))
     if not check2:
         return check2
-    check3 = dpt_ext.all(r1[mr] == r2[mr])
+    check3 = dpt.all(r1[mr] == r2[mr])
     if not check3:
         return check3
-    m = dpt_ext.isfinite(r1)
+    m = dpt.isfinite(r1)
     mv1 = r1[m]
     mv2 = r2[m]
-    check4 = dpt_ext.all(
-        dpt_ext.abs(mv1 - mv2)
-        <= dpt_ext.maximum(
-            atol, rtol * dpt_ext.maximum(dpt_ext.abs(mv1), dpt_ext.abs(mv2))
-        )
+    check4 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
     )
     return check4
 
 
 def _allclose_others(r1, r2):
-    return dpt_ext.all(r1 == r2)
+    return dpt.all(r1 == r2)
 
 
 def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
@@ -160,11 +151,11 @@ def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
     else:
         res_dt = np.promote_types(b1.dtype, b2.dtype)
         res_dt = _to_device_supported_dtype(res_dt, exec_q.sycl_device)
-        b1 = dpt_ext.astype(b1, res_dt)
-        b2 = dpt_ext.astype(b2, res_dt)
+        b1 = dpt.astype(b1, res_dt)
+        b2 = dpt.astype(b2, res_dt)
 
-    b1 = dpt_ext.broadcast_to(b1, res_sh)
-    b2 = dpt_ext.broadcast_to(b2, res_sh)
+    b1 = dpt.broadcast_to(b1, res_sh)
+    b2 = dpt.broadcast_to(b2, res_sh)
 
     k = b1.dtype.kind
     if k == "c":
diff --git a/dpctl_ext/tensor/_type_utils.py b/dpctl_ext/tensor/_type_utils.py
index 1e386e15dfa3..8c15053cb4c1 100644
--- a/dpctl_ext/tensor/_type_utils.py
+++ b/dpctl_ext/tensor/_type_utils.py
@@ -28,12 +28,11 @@
 
 from __future__ import annotations
 
-import dpctl.tensor as dpt
 import numpy as np
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 
 
@@ -450,7 +449,7 @@ def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev):
                 o1_dtype, WeakIntegralType
             ):
                 o1_val = o1_dtype.get()
-                o2_iinfo = dpt_ext.iinfo(o2_dtype)
+                o2_iinfo = dpt.iinfo(o2_dtype)
                 if (o1_val < o2_iinfo.min) or (o1_val > o2_iinfo.max):
                     return dpt.dtype(np.min_scalar_type(o1_val)), o2_dtype
             return o2_dtype, o2_dtype
@@ -473,7 +472,7 @@ def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev):
                 o2_dtype, WeakIntegralType
             ):
                 o2_val = o2_dtype.get()
-                o1_iinfo = dpt_ext.iinfo(o1_dtype)
+                o1_iinfo = dpt.iinfo(o1_dtype)
                 if (o2_val < o1_iinfo.min) or (o2_val > o1_iinfo.max):
                     return o1_dtype, dpt.dtype(np.min_scalar_type(o2_val))
             return o1_dtype, o1_dtype
@@ -936,8 +935,8 @@ def _default_accumulation_dtype(inp_dt, q):
             res_dt = inp_dt
     elif inp_kind in "u":
         res_dt = dpt.dtype(ti.default_device_uint_type(q))
-        res_ii = dpt_ext.iinfo(res_dt)
-        inp_ii = dpt_ext.iinfo(inp_dt)
+        res_ii = dpt.iinfo(res_dt)
+        inp_ii = dpt.iinfo(inp_dt)
         if inp_ii.min >= res_ii.min and inp_ii.max <= res_ii.max:
             pass
         else:
@@ -956,7 +955,7 @@ def _default_accumulation_dtype_fp_types(inp_dt, q):
     inp_kind = inp_dt.kind
     if inp_kind in "biu":
         res_dt = dpt.dtype(ti.default_device_fp_type(q))
-        can_cast_v = dpt_ext.can_cast(inp_dt, res_dt)
+        can_cast_v = dpt.can_cast(inp_dt, res_dt)
         if not can_cast_v:
             _fp64 = q.sycl_device.has_aspect_fp64
             res_dt = dpt.float64 if _fp64 else dpt.float32
diff --git a/dpctl_ext/tensor/_types.pxi b/dpctl_ext/tensor/_types.pxi
new file mode 100644
index 000000000000..090750658f4b
--- /dev/null
+++ b/dpctl_ext/tensor/_types.pxi
@@ -0,0 +1,169 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# these typenum values are aligned to values in NumPy
+cdef:
+    int UAR_BOOL = 0  # pragma: no cover
+    int UAR_BYTE = 1  # pragma: no cover
+    int UAR_UBYTE = 2  # pragma: no cover
+    int UAR_SHORT = 3  # pragma: no cover
+    int UAR_USHORT = 4  # pragma: no cover
+    int UAR_INT = 5  # pragma: no cover
+    int UAR_UINT = 6  # pragma: no cover
+    int UAR_LONG = 7  # pragma: no cover
+    int UAR_ULONG = 8  # pragma: no cover
+    int UAR_LONGLONG = 9  # pragma: no cover
+    int UAR_ULONGLONG = 10  # pragma: no cover
+    int UAR_FLOAT = 11  # pragma: no cover
+    int UAR_DOUBLE = 12  # pragma: no cover
+    int UAR_CFLOAT = 14  # pragma: no cover
+    int UAR_CDOUBLE = 15  # pragma: no cover
+    int UAR_TYPE_SENTINEL = 17  # pragma: no cover
+    int UAR_HALF = 23  # pragma: no cover
+
+cdef int type_bytesize(int typenum):
+    """
+    NPY_BOOL=0         : 1
+    NPY_BYTE=1         : 1
+    NPY_UBYTE=2        : 1
+    NPY_SHORT=3        : 2
+    NPY_USHORT=4       : 2
+    NPY_INT=5          : sizeof(int)
+    NPY_UINT=6         : sizeof(unsigned int)
+    NPY_LONG=7         : sizeof(long)
+    NPY_ULONG=8        : sizeof(unsigned long)
+    NPY_LONGLONG=9     : 8
+    NPY_ULONGLONG=10   : 8
+    NPY_FLOAT=11       : 4
+    NPY_DOUBLE=12      : 8
+    NPY_LONGDOUBLE=13  : N/A
+    NPY_CFLOAT=14      : 8
+    NPY_CDOUBLE=15     : 16
+    NPY_CLONGDOUBLE=16 : N/A
+    NPY_HALF=23        : 2
+    """
+    cdef int *type_to_bytesize = [
+        1,
+        sizeof(char),
+        sizeof(unsigned char),
+        sizeof(short),
+        sizeof(unsigned short),
+        sizeof(int),
+        sizeof(unsigned int),
+        sizeof(long),
+        sizeof(unsigned long),
+        sizeof(long long),
+        sizeof(unsigned long long),
+        sizeof(float),
+        sizeof(double), -1,
+        sizeof(float complex),
+        sizeof(double complex), -1]
+
+    if typenum < 0:  # pragma: no cover
+        return -1
+    if typenum > 16:
+        if typenum == 23:
+            return 2
+        return -1
+
+    return type_to_bytesize[typenum]
+
+
+cdef str _make_typestr(int typenum):
+    """
+    Make typestring from type number
+    """
+    cdef type_to_str = ["|b", "|i", "|u", "|i", "|u",
+                        "|i", "|u", "|i", "|u", "|i", "|u",
+                        "|f", "|f", "", "|c", "|c", ""]
+
+    if (typenum < 0):  # pragma: no cover
+        return ""
+    if (typenum > 16):
+        if (typenum == 23):
+            return "|f2"
+        return ""  # pragma: no cover
+
+    return type_to_str[typenum] + str(type_bytesize(typenum))
+
+
+cdef int typenum_from_format(str s):
+    """
+    Internal utility to convert string describing type format
+
+    Format is [<|=>][biufc]#
+    Shortcuts for formats are i, u, d, D
+    """
+    if not s:
+        return -1
+    try:
+        dt = np.dtype(s)
+    except Exception:
+        return -1
+    if (dt.byteorder == ">"):
+        return -2
+    return dt.num
+
+
+cdef int descr_to_typenum(object dtype):
+    """
+    Returns typenum for argumentd dtype that has attribute descr,
+    assumed numpy.dtype
+    """
+    obj = getattr(dtype, "descr")
+    if (not isinstance(obj, list) or len(obj) != 1):
+        return -1    # token for ValueError
+    obj = obj[0]
+    if (
+        not isinstance(obj, tuple) or len(obj) != 2 or obj[0]
+    ):  # pragma: no cover
+        return -1
+    obj = obj[1]
+    if not isinstance(obj, str):  # pragma: no cover
+        return -1
+    return typenum_from_format(obj)
+
+
+cdef int dtype_to_typenum(dtype):
+    if isinstance(dtype, str):
+        return typenum_from_format(dtype)
+    elif isinstance(dtype, bytes):
+        return typenum_from_format(dtype.decode("UTF-8"))
+    elif hasattr(dtype, "descr"):
+        return descr_to_typenum(dtype)
+    else:
+        try:
+            dt = np.dtype(dtype)
+        except TypeError:
+            return -3
+        except Exception:  # pragma: no cover
+            return -1
+        if hasattr(dt, "descr"):
+            return descr_to_typenum(dt)
+        else:  # pragma: no cover
+            return -3  # token for TypeError
diff --git a/dpctl_ext/tensor/_usmarray.pxd b/dpctl_ext/tensor/_usmarray.pxd
new file mode 100644
index 000000000000..ccb8f4c796b7
--- /dev/null
+++ b/dpctl_ext/tensor/_usmarray.pxd
@@ -0,0 +1,88 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+
+cimport dpctl
+
+
+cdef public api int USM_ARRAY_C_CONTIGUOUS
+cdef public api int USM_ARRAY_F_CONTIGUOUS
+cdef public api int USM_ARRAY_WRITABLE
+
+cdef public api int UAR_BOOL
+cdef public api int UAR_BYTE
+cdef public api int UAR_UBYTE
+cdef public api int UAR_SHORT
+cdef public api int UAR_USHORT
+cdef public api int UAR_INT
+cdef public api int UAR_UINT
+cdef public api int UAR_LONG
+cdef public api int UAR_ULONG
+cdef public api int UAR_LONGLONG
+cdef public api int UAR_ULONGLONG
+cdef public api int UAR_FLOAT
+cdef public api int UAR_DOUBLE
+cdef public api int UAR_CFLOAT
+cdef public api int UAR_CDOUBLE
+cdef public api int UAR_TYPE_SENTINEL
+cdef public api int UAR_HALF
+
+
+cdef api class usm_ndarray [object PyUSMArrayObject, type PyUSMArrayType]:
+    # data fields
+    cdef char* data_
+    cdef int nd_
+    cdef Py_ssize_t *shape_
+    cdef Py_ssize_t *strides_
+    cdef int typenum_
+    cdef int flags_
+    cdef object base_
+    cdef object array_namespace_
+    # make usm_ndarray weak-referenceable
+    cdef object __weakref__
+
+    cdef void _reset(usm_ndarray self)
+    cdef void _cleanup(usm_ndarray self)
+    cdef Py_ssize_t get_offset(usm_ndarray self) except *
+
+    cdef char* get_data(self)
+    cdef int get_ndim(self)
+    cdef Py_ssize_t * get_shape(self)
+    cdef Py_ssize_t * get_strides(self)
+    cdef int get_typenum(self)
+    cdef int get_itemsize(self)
+    cdef int get_flags(self)
+    cdef object get_base(self)
+    cdef dpctl.DPCTLSyclQueueRef get_queue_ref(self) except *
+    cdef dpctl.SyclQueue get_sycl_queue(self)
+
+    cdef _set_writable_flag(self, int)
+
+    cdef __cythonbufferdefaults__ = {"mode": "strided"}
diff --git a/dpctl_ext/tensor/_usmarray.pyx b/dpctl_ext/tensor/_usmarray.pyx
new file mode 100644
index 000000000000..4f3856a29fe4
--- /dev/null
+++ b/dpctl_ext/tensor/_usmarray.pyx
@@ -0,0 +1,1986 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+# cython: linetrace=True
+
+import dpctl
+import dpctl.memory as dpmem
+import numpy as np
+
+from dpctl._backend cimport DPCTLSyclUSMRef
+from dpctl._sycl_device_factory cimport _cached_default_device
+
+# TODO: remote it when dpnp fully migrates dpctl/tensor
+import dpctl_ext
+
+from ._data_types import bool as dpt_bool
+from ._device import Device
+from ._print import usm_ndarray_repr, usm_ndarray_str
+
+cimport dpctl as c_dpctl
+cimport dpctl.memory as c_dpmem
+from cpython.mem cimport PyMem_Free
+from cpython.tuple cimport PyTuple_New, PyTuple_SetItem
+
+from . cimport _dlpack as c_dlpack
+
+from enum import IntEnum
+
+from . import _flags
+from ._dlpack import get_build_dlpack_version
+from ._tensor_impl import default_device_fp_type
+
+include "_stride_utils.pxi"
+include "_types.pxi"
+include "_slicing.pxi"
+
+
+class DLDeviceType(IntEnum):
+    """
+    An :class:`enum.IntEnum` for the types of DLDevices supported by the DLPack
+    protocol.
+
+        ``kDLCPU``:
+            CPU (host) device
+        ``kDLCUDA``:
+            CUDA GPU device
+        ``kDLCUDAHost``:
+            Pinned CUDA CPU memory by cudaMallocHost
+        ``kDLOpenCL``:
+            OpenCL device
+        ``kDLVulkan``:
+            Vulkan buffer
+        ``kDLMetal``:
+            Metal for Apple GPU
+        ``kDLVPI``:
+            Verilog simulator buffer
+        ``kDLROCM``:
+            ROCm GPU device
+        ``kDLROCMHost``:
+            Pinned ROCm CPU memory allocated by hipMallocHost
+        ``kDLExtDev``:
+            Reserved extension device type used to test new devices
+        ``kDLCUDAManaged``:
+            CUDA managed/unified memory allocated by cudaMallocManaged
+        ``kDLOneAPI``:
+            Unified shared memory allocated on a oneAPI non-partitioned device
+        ``kDLWebGPU``:
+            Device support for WebGPU standard
+        ``kDLHexagon``:
+            Qualcomm Hexagon DSP
+        ``kDLMAIA``:
+            Microsoft MAIA device
+        ``kDLTrn``:
+            AWS Trainium device
+    """
+    kDLCPU = c_dlpack.device_CPU
+    kDLCUDA = c_dlpack.device_CUDA
+    kDLCUDAHost = c_dlpack.device_CUDAHost
+    kDLCUDAManaged = c_dlpack.device_CUDAManaged
+    kDLROCM = c_dlpack.device_DLROCM
+    kDLROCMHost = c_dlpack.device_ROCMHost
+    kDLOpenCL = c_dlpack.device_OpenCL
+    kDLVulkan = c_dlpack.device_Vulkan
+    kDLMetal = c_dlpack.device_Metal
+    kDLVPI = c_dlpack.device_VPI
+    kDLOneAPI = c_dlpack.device_OneAPI
+    kDLWebGPU = c_dlpack.device_WebGPU
+    kDLHexagon = c_dlpack.device_Hexagon
+    kDLMAIA = c_dlpack.device_MAIA
+    kDLTrn = c_dlpack.device_Trn
+
+
+cdef class InternalUSMArrayError(Exception):
+    """
+    An InternalUSMArrayError exception is raised when internal
+    inconsistency has been detected in :class:`.usm_ndarray`.
+    """
+    pass
+
+
+cdef object _as_zero_dim_ndarray(object usm_ary):
+    "Convert size-1 array to NumPy 0d array"
+    mem_view = dpmem.as_usm_memory(usm_ary)
+    usm_ary.sycl_queue.wait()
+    host_buf = mem_view.copy_to_host()
+    view = host_buf.view(usm_ary.dtype)
+    view.shape = tuple()
+    return view
+
+
+cdef inline void _check_0d_scalar_conversion(object usm_ary) except *:
+    "Raise TypeError if array cannot be converted to a Python scalar"
+    if (usm_ary.ndim != 0):
+        raise TypeError(
+            "only 0-dimensional arrays can be converted to Python scalars"
+        )
+
+
+cdef int _copy_writable(int lhs_flags, int rhs_flags):
+    "Copy the WRITABLE flag to lhs_flags from rhs_flags"
+    return (lhs_flags & ~USM_ARRAY_WRITABLE) | (rhs_flags & USM_ARRAY_WRITABLE)
+
+
+cdef bint _is_host_cpu(object dl_device):
+    "Check if dl_device denotes (kDLCPU, 0)"
+    cdef object dl_type
+    cdef object dl_id
+    cdef Py_ssize_t n_elems = -1
+
+    try:
+        n_elems = len(dl_device)
+    except TypeError:
+        pass
+
+    if n_elems != 2:
+        return False
+
+    dl_type = dl_device[0]
+    dl_id = dl_device[1]
+    if isinstance(dl_type, str):
+        return (dl_type == "kDLCPU" and dl_id == 0)
+
+    return (dl_type == DLDeviceType.kDLCPU) and (dl_id == 0)
+
+
+cdef void _validate_and_use_stream(
+    object stream, c_dpctl.SyclQueue self_queue
+) except *:
+    if (stream is None or stream == self_queue):
+        pass
+    else:
+        if not isinstance(stream, dpctl.SyclQueue):
+            raise TypeError(
+                "stream argument type was expected to be dpctl.SyclQueue,"
+                f" got {type(stream)} instead"
+            )
+        ev = self_queue.submit_barrier()
+        stream.submit_barrier(dependent_events=[ev])
+
+cdef class usm_ndarray:
+    """ usm_ndarray(shape, dtype=None, strides=None, buffer="device", \
+           offset=0, order="C", buffer_ctor_kwargs=dict(), \
+           array_namespace=None)
+
+    An array object represents a multidimensional tensor of numeric
+    elements stored in a USM allocation on a SYCL device.
+
+    Arg:
+        shape (int, tuple):
+            Shape of the array to be created.
+        dtype (str, dtype):
+            Array data type, i.e. the type of array elements.
+            If ``dtype`` has the value ``None``, it is determined by default
+            floating point type supported by target device.
+            The supported types are
+
+                ``bool``:
+                    boolean type
+                ``int8``, ``int16``, ``int32``, ``int64``:
+                    signed integer types
+                ``uint8``, ``uint16``, ``uint32``, ``uint64``:
+                    unsigned integer types
+                ``float16``:
+                    half-precision floating type,
+                    supported if target device's property
+                    ``has_aspect_fp16`` is ``True``
+                ``float32``, ``complex64``:
+                    single-precision real and complex floating types
+                ``float64``, ``complex128``:
+                    double-precision real and complex floating
+                    types, supported if target device's property
+                    ``has_aspect_fp64`` is ``True``.
+
+            Default: ``None``.
+        strides (tuple, optional):
+            Strides of the array to be created in elements.
+            If ``strides`` has the value ``None``, it is determined by the
+            ``shape`` of the array and the requested ``order``.
+            Default: ``None``.
+        buffer (str, object, optional):
+            A string corresponding to the type of USM allocation to make,
+            or a Python object representing a USM memory allocation, i.e.
+            :class:`dpctl.memory.MemoryUSMDevice`,
+            :class:`dpctl.memory.MemoryUSMShared`, or
+            :class:`dpctl.memory.MemoryUSMHost`. Recognized strings are
+            ``"device"``, ``"shared"``, or ``"host"``. Additional arguments to
+            the USM memory allocators can be passed in a dictionary specified
+            via ``buffer_ctor_kwrds`` keyword parameter.
+            Default: ``"device"``.
+        offset (int, optional):
+            Offset of the array element with all zero indexes relative to the
+            start of the provided `buffer` in elements. The argument is ignored
+            if the ``buffer`` value is a string and the memory is allocated by
+            the constructor. Default: ``0``.
+        order ({"C", "F"}, optional):
+            The memory layout of the array when constructing using a new
+            allocation. Value ``"C"`` corresponds to C-contiguous, or row-major
+            memory layout, while value ``"F"`` corresponds to F-contiguous, or
+            column-major layout. Default: ``"C"``.
+        buffer_ctor_kwargs (dict, optional):
+            Dictionary with keyword parameters to use when creating a new USM
+            memory allocation. See :class:`dpctl.memory.MemoryUSMShared` for
+            supported keyword arguments.
+        array_namespace (module, optional):
+            Array namespace module associated with this array.
+            Default: ``None``.
+
+    ``buffer`` can be ``"shared"``, ``"host"``, ``"device"`` to allocate
+    new device memory by calling respective constructor with
+    the specified ``buffer_ctor_kwrds``; ``buffer`` can be an
+    instance of :class:`dpctl.memory.MemoryUSMShared`,
+    :class:`dpctl.memory.MemoryUSMDevice`, or
+    :class:`dpctl.memory.MemoryUSMHost`; ``buffer`` can also be
+    another :class:`dpctl.tensor.usm_ndarray` instance, in which case its
+    underlying ``MemoryUSM*`` buffer is used.
+    """
+
+    cdef void _reset(usm_ndarray self):
+        """
+        Initializes member fields
+        """
+        self.base_ = None
+        self.array_namespace_ = None
+        self.nd_ = -1
+        self.data_ = <char *>0
+        self.shape_ = <Py_ssize_t *>0
+        self.strides_ = <Py_ssize_t *>0
+        self.flags_ = 0
+
+    cdef void _cleanup(usm_ndarray self):
+        if (self.shape_):
+            PyMem_Free(self.shape_)
+        if (self.strides_):
+            PyMem_Free(self.strides_)
+        self._reset()
+
+    def __cinit__(self, shape, dtype=None, strides=None, buffer="device",
+                  Py_ssize_t offset=0, order="C",
+                  buffer_ctor_kwargs=dict(),
+                  array_namespace=None):
+        """
+        strides and offset must be given in units of array elements.
+        buffer can be strings ('device'|'shared'|'host' to allocate new memory)
+        or ``dpctl.memory.MemoryUSM*`` buffers, or ``usm_ndarray`` instances.
+        """
+        cdef int nd = 0
+        cdef int typenum = 0
+        cdef int itemsize = 0
+        cdef int err = 0
+        cdef int contig_flag = 0
+        cdef int writable_flag = USM_ARRAY_WRITABLE
+        cdef Py_ssize_t *shape_ptr = NULL
+        cdef Py_ssize_t ary_nelems = 0
+        cdef Py_ssize_t ary_nbytes = 0
+        cdef Py_ssize_t *strides_ptr = NULL
+        cdef Py_ssize_t _offset = offset
+        cdef Py_ssize_t ary_min_displacement = 0
+        cdef Py_ssize_t ary_max_displacement = 0
+        cdef bint is_fp64 = False
+        cdef bint is_fp16 = False
+
+        self._reset()
+        if not isinstance(shape, (list, tuple)):
+            if hasattr(shape, "tolist"):
+                fn = getattr(shape, "tolist")
+                if callable(fn):
+                    shape = shape.tolist()
+            if not isinstance(shape, (list, tuple)):
+                try:
+                    <Py_ssize_t> shape
+                    shape = [shape, ]
+                except Exception as e:
+                    raise TypeError(
+                        "Argument shape must a non-negative integer, "
+                        "or a list/tuple of such integers."
+                    ) from e
+        nd = len(shape)
+        if dtype is None:
+            if isinstance(buffer, (dpmem._memory._Memory, usm_ndarray)):
+                q = buffer.sycl_queue
+            else:
+                q = buffer_ctor_kwargs.get("queue")
+            if q is not None:
+                dtype = default_device_fp_type(q)
+            else:
+                dev = _cached_default_device()
+                dtype = "f8" if dev.has_aspect_fp64 else "f4"
+        typenum = dtype_to_typenum(dtype)
+        if (typenum < 0):
+            if typenum == -2:
+                raise ValueError(
+                    "Data type '" + str(dtype) +
+                    "' can only have native byteorder."
+                )
+            elif typenum == -1:
+                raise ValueError(
+                    "Data type '" + str(dtype) + "' is not understood."
+                )
+            raise TypeError(
+                f"Expected string or a dtype object, got {type(dtype)}"
+            )
+        itemsize = type_bytesize(typenum)
+        if (itemsize < 1):
+            raise TypeError(
+                "dtype=" + np.dtype(dtype).name + " is not supported."
+            )
+        # allocate host C-arrays for shape, strides
+        err = _from_input_shape_strides(
+            nd, shape, strides, itemsize, <char> ord(order),
+            &shape_ptr, &strides_ptr, &ary_nelems,
+            &ary_min_displacement, &ary_max_displacement, &contig_flag
+        )
+        if (err):
+            self._cleanup()
+            if err == ERROR_MALLOC:
+                raise MemoryError("Memory allocation for shape/strides "
+                                  "array failed.")
+            elif err == ERROR_INCORRECT_ORDER:
+                raise ValueError(
+                    "Unsupported order='{}' given. "
+                    "Supported values are 'C' or 'F'.".format(order))
+            elif err == ERROR_UNEXPECTED_STRIDES:
+                raise ValueError(
+                    "strides={} is not understood".format(strides))
+            else:
+                raise InternalUSMArrayError(
+                    " .. while processing shape and strides.")
+        ary_nbytes = (ary_max_displacement -
+                      ary_min_displacement + 1) * itemsize
+        if isinstance(buffer, dpmem._memory._Memory):
+            _buffer = buffer
+        elif isinstance(buffer, (str, bytes)):
+            if isinstance(buffer, bytes):
+                buffer = buffer.decode("UTF-8")
+            _offset = -ary_min_displacement
+            if (buffer == "shared"):
+                _buffer = dpmem.MemoryUSMShared(ary_nbytes,
+                                                **buffer_ctor_kwargs)
+            elif (buffer == "device"):
+                _buffer = dpmem.MemoryUSMDevice(ary_nbytes,
+                                                **buffer_ctor_kwargs)
+            elif (buffer == "host"):
+                _buffer = dpmem.MemoryUSMHost(ary_nbytes,
+                                              **buffer_ctor_kwargs)
+            else:
+                self._cleanup()
+                raise ValueError(
+                    "buffer='{}' is not understood. "
+                    "Recognized values are 'device', 'shared',  'host', "
+                    "an instance of `MemoryUSM*` object, or a usm_ndarray"
+                    "".format(buffer)
+                )
+        elif isinstance(buffer, usm_ndarray):
+            if not buffer.flags.writable:
+                writable_flag = 0
+            _buffer = buffer.usm_data
+        else:
+            self._cleanup()
+            raise ValueError("buffer='{}' was not understood.".format(buffer))
+        if (shape_to_elem_count(nd, shape_ptr) > 0 and
+            (_offset + ary_min_displacement < 0 or
+             (_offset + ary_max_displacement + 1) * itemsize > _buffer.nbytes)):
+            self._cleanup()
+            raise ValueError(("buffer='{}' can not accommodate "
+                              "the requested array.").format(buffer))
+        is_fp64 = (typenum == UAR_DOUBLE or typenum == UAR_CDOUBLE)
+        is_fp16 = (typenum == UAR_HALF)
+        if (is_fp64 or is_fp16):
+            if (
+                (is_fp64 and not _buffer.sycl_device.has_aspect_fp64) or
+                (is_fp16 and not _buffer.sycl_device.has_aspect_fp16)
+            ):
+                raise ValueError(
+                    f"Device {_buffer.sycl_device.name} does"
+                    f" not support {dtype} natively."
+                )
+        self.base_ = _buffer
+        self.data_ = (<char *> (<size_t> _buffer._pointer)) + itemsize * _offset
+        self.shape_ = shape_ptr
+        self.strides_ = strides_ptr
+        self.typenum_ = typenum
+        self.flags_ = (contig_flag | writable_flag)
+        self.nd_ = nd
+        self.array_namespace_ = array_namespace
+
+    def __dealloc__(self):
+        self._cleanup()
+
+    @property
+    def _pointer(self):
+        """
+        Returns USM pointer to the start of array (element with zero
+        multi-index) encoded as integer.
+        """
+        return <size_t> self.get_data()
+
+    cdef Py_ssize_t get_offset(self) except *:
+        cdef char *mem_ptr = NULL
+        cdef char *ary_ptr = self.get_data()
+        mem_ptr = <char *>(<size_t> self.base_._pointer)
+        byte_offset = ary_ptr - mem_ptr
+        item_size = self.get_itemsize()
+        if (byte_offset % item_size):
+            raise InternalUSMArrayError(
+                "byte_offset is not a multiple of item_size.")
+        return byte_offset // item_size
+
+    @property
+    def _element_offset(self):
+        """Returns the offset of the zero-index element of the array, in
+        elements, relative to the start of memory allocation"""
+        return self.get_offset()
+
+    @property
+    def _byte_bounds(self):
+        """Returns a 2-tuple with pointers to the end-points of the array
+
+        :Example:
+
+            .. code-block:: python
+
+                from dpctl_ext import tensor
+
+                x = tensor.ones((3, 10, 7))
+                y = tensor.flip(x[:, 1::2], axis=1)
+
+                beg_p, end_p = y._byte_bounds
+                # Bytes taken to store this array
+                bytes_extent = end_p - beg_p
+
+                # C-contiguous copy is more compact
+                yc = tensor.copy(y, order="C")
+                beg_pc, end_pc = yc._byte_bounds
+                assert bytes_extent < end_pc - beg_pc
+        """
+        cdef Py_ssize_t min_disp = 0
+        cdef Py_ssize_t max_disp = 0
+        cdef Py_ssize_t step_ = 0
+        cdef Py_ssize_t dim_ = 0
+        cdef int it = 0
+        cdef Py_ssize_t _itemsize = self.get_itemsize()
+
+        if (
+            (self.flags_ & USM_ARRAY_C_CONTIGUOUS)
+            or (self.flags_ & USM_ARRAY_F_CONTIGUOUS)
+        ):
+            return (
+                self._pointer,
+                self._pointer + shape_to_elem_count(
+                    self.nd_, self.shape_
+                ) * _itemsize
+            )
+
+        for it in range(self.nd_):
+            dim_ = self.shape[it]
+            if dim_ > 0:
+                step_ = self.strides[it]
+                if step_ > 0:
+                    max_disp += step_ * (dim_ - 1)
+                else:
+                    min_disp += step_ * (dim_ - 1)
+
+        return (
+            self._pointer + min_disp * _itemsize,
+            self._pointer + (max_disp + 1) * _itemsize
+        )
+
+    cdef char* get_data(self):
+        """Returns the USM pointer for this array."""
+        return self.data_
+
+    cdef int get_ndim(self):
+        """
+        Returns the number of indices needed to address
+        an element of this array.
+        """
+        return self.nd_
+
+    cdef Py_ssize_t* get_shape(self):
+        """
+        Returns pointer to shape C-array for this array.
+
+        C-array has at least ``ndim`` non-negative elements,
+        which determine the range of permissible indices
+        addressing individual elements of this array.
+        """
+        return self.shape_
+
+    cdef Py_ssize_t* get_strides(self):
+        """
+        Returns pointer to strides C-array for this array.
+
+        The pointer can be NULL (contiguous array), or the
+        array size is at least ``ndim`` elements
+        """
+        return self.strides_
+
+    cdef int get_typenum(self):
+        """Returns typenum corresponding to values of this array"""
+        return self.typenum_
+
+    cdef int get_itemsize(self):
+        """
+        Returns itemsize of this arrays in bytes
+        """
+        return type_bytesize(self.typenum_)
+
+    cdef int get_flags(self):
+        """Returns flags of this array"""
+        return self.flags_
+
+    cdef object get_base(self):
+        """Returns the object owning the USM data addressed by this array"""
+        return self.base_
+
+    cdef c_dpctl.SyclQueue get_sycl_queue(self):
+        cdef c_dpmem._Memory mem
+        if not isinstance(self.base_, dpctl.memory._Memory):
+            raise InternalUSMArrayError(
+                "This array has unexpected memory owner"
+            )
+        mem = <c_dpmem._Memory> self.base_
+        return mem.queue
+
+    cdef c_dpctl.DPCTLSyclQueueRef get_queue_ref(self) except *:
+        """
+        Returns a copy of DPCTLSyclQueueRef associated with array
+        """
+        cdef c_dpctl.SyclQueue q = self.get_sycl_queue()
+        cdef c_dpctl.DPCTLSyclQueueRef QRef = q.get_queue_ref()
+        cdef c_dpctl.DPCTLSyclQueueRef QRefCopy = NULL
+        if QRef is not NULL:
+            QRefCopy = c_dpctl.DPCTLQueue_Copy(QRef)
+            return QRefCopy
+        else:
+            raise InternalUSMArrayError(
+                "Memory owner of this array is corrupted"
+            )
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        """
+        Gives ``__sycl_usm_array_interface__`` dictionary describing
+        the array.
+        """
+        cdef Py_ssize_t byte_offset = -1
+        cdef int item_size = -1
+        cdef Py_ssize_t elem_offset = -1
+        cdef char *mem_ptr = NULL
+        cdef char *ary_ptr = NULL
+        if (not isinstance(self.base_, dpmem._memory._Memory)):
+            raise InternalUSMArrayError(
+                "Invalid instance of usm_ndarray encountered. "
+                "Private field base_ has an unexpected type {}.".format(
+                    type(self.base_)
+                )
+            )
+        ary_iface = self.base_.__sycl_usm_array_interface__
+        mem_ptr = <char *>(<size_t> ary_iface["data"][0])
+        ary_ptr = <char *>(<size_t> self.data_)
+        ro_flag = False if (self.flags_ & USM_ARRAY_WRITABLE) else True
+        ary_iface["data"] = (<size_t> mem_ptr, ro_flag)
+        ary_iface["shape"] = self.shape
+        if (self.strides_):
+            ary_iface["strides"] = _make_int_tuple(self.nd_, self.strides_)
+        else:
+            if (self.flags_ & USM_ARRAY_C_CONTIGUOUS):
+                ary_iface["strides"] = None
+            elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS):
+                ary_iface["strides"] = _f_contig_strides(self.nd_, self.shape_)
+            else:
+                raise InternalUSMArrayError(
+                    "USM Array is not contiguous and has empty strides"
+                )
+        ary_iface["typestr"] = _make_typestr(self.typenum_)
+        byte_offset = ary_ptr - mem_ptr
+        item_size = self.get_itemsize()
+        if (byte_offset % item_size):
+            raise InternalUSMArrayError(
+                "byte_offset is not a multiple of item_size.")
+        elem_offset = byte_offset // item_size
+        ary_iface["offset"] = elem_offset
+        # must wait for content of the memory to finalize
+        self.sycl_queue.wait()
+        return ary_iface
+
+    @property
+    def ndim(self):
+        """
+        Gives the number of indices needed to address elements of this array.
+        """
+        return self.nd_
+
+    @property
+    def usm_data(self):
+        """
+        Gives USM memory object underlying :class:`.usm_ndarray` instance.
+        """
+        return self.get_base()
+
+    @property
+    def shape(self):
+        """
+        Elements of the shape tuple give the lengths of the
+        respective array dimensions.
+
+        Setting shape is allowed only when reshaping to the requested
+        dimensions can be returned as view, otherwise :exc:`AttributeError`
+        is raised. Use :func:`dpctl.tensor.reshape` to reshape the array
+        in all cases.
+
+        :Example:
+
+            .. code-block:: python
+
+                from dpctl_ext import tensor
+
+                x = tensor.arange(899)
+                x.shape = (29, 31)
+        """
+        if self.nd_ > 0:
+            return _make_int_tuple(self.nd_, self.shape_)
+        else:
+            return tuple()
+
+    @shape.setter
+    def shape(self, new_shape):
+        """
+        Modifies usm_ndarray instance in-place by changing its metadata
+        about the shape and the strides of the array, or raises
+        `AttributeError` exception if in-place change is not possible.
+
+        Args:
+            new_shape: (tuple, int)
+                New shape. Only non-negative values are supported.
+                The new shape may not lead to the change in the
+                number of elements in the array.
+
+        Whether the array can be reshape in-place depends on its
+        strides. Use :func:`dpctl.tensor.reshape` function which
+        always succeeds to reshape the array by performing a copy
+        if necessary.
+        """
+        cdef int new_nd = -1
+        cdef Py_ssize_t nelems = -1
+        cdef int err = 0
+        cdef Py_ssize_t min_disp = 0
+        cdef Py_ssize_t max_disp = 0
+        cdef int contig_flag = 0
+        cdef Py_ssize_t *shape_ptr = NULL
+        cdef Py_ssize_t *strides_ptr = NULL
+        cdef Py_ssize_t size = -1
+        import operator
+
+        from ._reshape import reshaped_strides
+
+        try:
+            new_nd = len(new_shape)
+        except TypeError:
+            new_nd = 1
+            new_shape = (new_shape,)
+        try:
+            new_shape = tuple(operator.index(dim) for dim in new_shape)
+        except TypeError:
+            raise TypeError(
+                "Target shape must be a finite iterable of integers"
+            )
+        size = shape_to_elem_count(self.nd_, self.shape_)
+        if not np.prod(new_shape) == size:
+            raise TypeError(
+                f"Can not reshape array of size {self.size} into {new_shape}"
+            )
+        if size > 0:
+            new_strides = reshaped_strides(
+               self.shape,
+               self.strides,
+               new_shape
+            )
+        else:
+            new_strides = (1,) * len(new_shape)
+        if new_strides is None:
+            raise AttributeError(
+                "Incompatible shape for in-place modification. "
+                "Use `reshape()` to make a copy with the desired shape."
+            )
+        err = _from_input_shape_strides(
+            new_nd, new_shape, new_strides,
+            self.get_itemsize(),
+            b"C",
+            &shape_ptr, &strides_ptr,
+            &nelems, &min_disp, &max_disp, &contig_flag
+        )
+        if (err == 0):
+            if (self.shape_):
+                PyMem_Free(self.shape_)
+            if (self.strides_):
+                PyMem_Free(self.strides_)
+            self.flags_ = (contig_flag | (self.flags_ & USM_ARRAY_WRITABLE))
+            self.nd_ = new_nd
+            self.shape_ = shape_ptr
+            self.strides_ = strides_ptr
+        else:
+            raise InternalUSMArrayError(
+                "Encountered in shape setter, error code {err}".format(err)
+            )
+
+    @property
+    def strides(self):
+        """
+        Returns memory displacement in array elements, upon unit
+        change of respective index.
+
+        For example, for strides ``(s1, s2, s3)`` and multi-index
+        ``(i1, i2, i3)`` position of the respective element relative
+        to zero multi-index element is ``s1*s1 + s2*i2 + s3*i3``.
+
+        :Example:
+
+            .. code-block:: python
+
+                from dpctl_ext import tensor
+
+                x = tensor.zeros((20, 30))
+                xv = x[10:, :15]
+
+                multi_id = (3, 5)
+                byte_displacement = xv[multi_id]._pointer - xv[0, 0]._pointer
+                element_displacement = sum(
+                    i * s for i, s in zip(multi_id, xv.strides)
+                )
+                assert byte_displacement == element_displacement * xv.itemsize
+        """
+        if (self.strides_):
+            return _make_int_tuple(self.nd_, self.strides_)
+        else:
+            if (self.flags_ & USM_ARRAY_C_CONTIGUOUS):
+                return _c_contig_strides(self.nd_, self.shape_)
+            elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS):
+                return _f_contig_strides(self.nd_, self.shape_)
+            else:
+                raise ValueError("Inconsistent usm_ndarray data")
+
+    @property
+    def flags(self):
+        """
+        Returns :class:`dpctl.tensor._flags.Flags` object.
+        """
+        return _flags.Flags(self, self.flags_)
+
+    cdef _set_writable_flag(self, int flag):
+        cdef int mask = (USM_ARRAY_WRITABLE if flag else 0)
+        self.flags_ = _copy_writable(self.flags_, mask)
+
+    @property
+    def usm_type(self):
+        """
+        USM type of underlying memory. Possible values are:
+
+            * ``"device"``
+                USM-device allocation in device memory, only accessible
+                to kernels executed on the device
+            * ``"shared"``
+                USM-shared allocation in device memory, accessible both
+                from the device and from host
+            * ``"host"``
+                USM-host allocation in host memory, accessible both
+                from the device and from host
+
+        See: https://docs.oneapi.com/versions/latest/dpcpp/iface/usm.html
+        """
+        return self.base_.get_usm_type()
+
+    @property
+    def itemsize(self):
+        """
+        Size of array element in bytes.
+        """
+        return self.get_itemsize()
+
+    @property
+    def nbytes(self):
+        """
+        Total bytes consumed by the elements of the array.
+        """
+        return (
+            shape_to_elem_count(self.nd_, self.shape_) *
+            self.get_itemsize())
+
+    @property
+    def size(self):
+        """
+        Number of elements in the array.
+        """
+        return shape_to_elem_count(self.nd_, self.shape_)
+
+    @property
+    def dtype(self):
+        """
+        Returns NumPy's dtype corresponding to the type of the array elements.
+        """
+        return np.dtype(_make_typestr(self.typenum_))
+
+    @property
+    def sycl_queue(self):
+        """
+        Returns :class:`dpctl.SyclQueue` object associated with USM data.
+        """
+        return self.get_sycl_queue()
+
+    @property
+    def sycl_device(self):
+        """
+        Returns :class:`dpctl.SyclDevice` object on which USM data
+        was allocated.
+        """
+        q = self.sycl_queue
+        return q.sycl_device
+
+    @property
+    def device(self):
+        """
+        Returns :class:`dpctl.tensor.Device` object representing
+        residence of the array data.
+
+        The ``Device`` object represents Array API notion of the
+        device, and contains :class:`dpctl.SyclQueue` associated
+        with this array. Hence, ``.device`` property provides
+        information distinct from ``.sycl_device`` property.
+
+        :Example:
+
+            .. code-block:: python
+
+                >>> from dpctl_ext import tensor
+                >>> x = tensor.ones(10)
+                >>> x.device
+                Device(level_zero:gpu:0)
+        """
+        return Device.create_device(self.sycl_queue)
+
+    @property
+    def sycl_context(self):
+        """
+        Returns :class:`dpctl.SyclContext` object to which USM data is bound.
+        """
+        q = self.sycl_queue
+        return q.sycl_context
+
+    @property
+    def T(self):
+        """Returns transposed array for 2D array, raises ``ValueError``
+        otherwise.
+        """
+        if self.nd_ == 2:
+            return _transpose(self)
+        else:
+            raise ValueError(
+                "array.T requires array to have 2 dimensions. "
+                "Use array.mT to transpose stacks of matrices and "
+                "dpctl.tensor.permute_dims() to permute dimensions."
+            )
+
+    @property
+    def mT(self):
+        """ Returns array (a view) where the last two dimensions are
+        transposed.
+        """
+        if self.nd_ < 2:
+            raise ValueError(
+                "array.mT requires array to have at least 2 dimensions."
+            )
+        return _m_transpose(self)
+
+    @property
+    def real(self):
+        """
+        Returns view into real component for arrays with
+        complex data-types and returns itself for all other
+        data-types.
+
+        :Example:
+
+            .. code-block:: python
+
+                from dpctl_ext import tensor
+
+                # Create complex array from
+                # arrays of real and imaginary parts
+
+                re = tensor.linspace(-1, 1, num=100, dtype="f4")
+                im = tensor.full_like(re, fill_value=tensor.pi)
+
+                z = tensor.empty_like(re, dtype="c8")
+                z.real[:] = re
+                z.imag[:] = im
+        """
+        # explicitly check for UAR_HALF, which is greater than UAR_CFLOAT
+        if (self.typenum_ < UAR_CFLOAT or self.typenum_ == UAR_HALF):
+            # elements are real
+            return self
+        if (self.typenum_ < UAR_TYPE_SENTINEL):
+            return _real_view(self)
+
+    @property
+    def imag(self):
+        """ Returns view into imaginary component for arrays with
+        complex data-types and returns new zero array for all other
+        data-types.
+
+        :Example:
+
+            .. code-block:: python
+
+                from dpctl_ext import tensor
+
+                # Reset imaginary part of complex array
+
+                z = tensor.ones(100, dtype="c8")
+                z.imag[:] = dpt.pi/2
+        """
+        # explicitly check for UAR_HALF, which is greater than UAR_CFLOAT
+        if (self.typenum_ < UAR_CFLOAT or self.typenum_ == UAR_HALF):
+            # elements are real
+            return _zero_like(self)
+        if (self.typenum_ < UAR_TYPE_SENTINEL):
+            return _imag_view(self)
+
+    def __getitem__(self, ind):
+        cdef tuple _meta = _basic_slice_meta(
+            ind, (<object>self).shape, (<object> self).strides,
+            self.get_offset())
+        cdef usm_ndarray res
+        cdef int i = 0
+        cdef bint matching = 1
+
+        if len(_meta) < 5:
+            raise RuntimeError
+
+        res = usm_ndarray.__new__(
+            usm_ndarray,
+            _meta[0],
+            dtype=_make_typestr(self.typenum_),
+            strides=_meta[1],
+            buffer=self.base_,
+            offset=_meta[2]
+        )
+        res.array_namespace_ = self.array_namespace_
+
+        adv_ind = _meta[3]
+        adv_ind_start_p = _meta[4]
+
+        if adv_ind_start_p < 0:
+            res.flags_ = _copy_writable(res.flags_, self.flags_)
+            return res
+
+        from ._copy_utils import _extract_impl, _nonzero_impl, _take_multi_index
+
+        # if len(adv_ind == 1), the (only) element is always an array
+        if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool:
+            key_ = adv_ind[0]
+            adv_ind_end_p = key_.ndim + adv_ind_start_p
+            if adv_ind_end_p > res.ndim:
+                raise IndexError("too many indices for the array")
+            key_shape = key_.shape
+            arr_shape = res.shape[adv_ind_start_p:adv_ind_end_p]
+            for i in range(key_.ndim):
+                if matching:
+                    if not key_shape[i] == arr_shape[i] and key_shape[i] > 0:
+                        matching = 0
+            if not matching:
+                raise IndexError(
+                    "boolean index did not match indexed array in dimensions"
+                )
+            res = _extract_impl(res, key_, axis=adv_ind_start_p)
+            res.flags_ = _copy_writable(res.flags_, self.flags_)
+            return res
+
+        if any(
+            (
+                isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool
+            ) for ind in adv_ind
+        ):
+            adv_ind_int = list()
+            for ind in adv_ind:
+                if isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool:
+                    adv_ind_int.extend(_nonzero_impl(ind))
+                else:
+                    adv_ind_int.append(ind)
+            res = _take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p)
+            res.flags_ = _copy_writable(res.flags_, self.flags_)
+            return res
+
+        res = _take_multi_index(res, adv_ind, adv_ind_start_p)
+        res.flags_ = _copy_writable(res.flags_, self.flags_)
+        return res
+
+    def to_device(self, target_device, /, *, stream=None):
+        """ to_device(target_device, /, *, stream=None)
+
+        Transfers this array to specified target device.
+
+        :Example:
+            .. code-block:: python
+
+                import dpctl
+                import dpctl_ext.tensor as dpt
+
+                x = dpt.full(10**6, 2, dtype="int64")
+                q_prof = dpctl.SyclQueue(
+                    x.sycl_device, property="enable_profiling")
+                # return a view with profile-enabled queue
+                y = x.to_device(q_prof)
+                timer = dpctl.SyclTimer()
+                with timer(q_prof):
+                    z = y * y
+                print(timer.dt)
+
+        Args:
+            target_device (object):
+                Array API concept of target device.
+                It can be a oneAPI filter selector string,
+                an instance of :class:`dpctl.SyclDevice` corresponding to a
+                non-partitioned SYCL device, an instance of
+                :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device`
+                object returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            stream (:class:`dpctl.SyclQueue`, optional):
+                Execution queue to synchronize with. If ``None``,
+                synchronization is not performed.
+
+        Returns:
+            usm_ndarray:
+                A view if data copy is not required, and a copy otherwise.
+                If copying is required, it is done by copying from the original
+                allocation device to the host, followed by copying from host
+                to the target device.
+        """
+        cdef c_dpctl.DPCTLSyclQueueRef QRef = NULL
+        cdef c_dpmem._Memory arr_buf
+        d = Device.create_device(target_device)
+
+        _validate_and_use_stream(stream, self.sycl_queue)
+
+        if (d.sycl_context == self.sycl_context):
+            arr_buf = <c_dpmem._Memory> self.usm_data
+            QRef = (<c_dpctl.SyclQueue> d.sycl_queue).get_queue_ref()
+            view_buffer = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+                <DPCTLSyclUSMRef>arr_buf.get_data_ptr(),
+                arr_buf.nbytes,
+                QRef,
+                memory_owner=arr_buf
+            )
+            res = usm_ndarray(
+                self.shape,
+                self.dtype,
+                buffer=view_buffer,
+                strides=self.strides,
+                offset=self.get_offset()
+            )
+            res.flags_ = self.flags_
+            return res
+        else:
+            nbytes = self.usm_data.nbytes
+            copy_buffer = type(self.usm_data)(
+                nbytes, queue=d.sycl_queue
+            )
+            copy_buffer.copy_from_device(self.usm_data)
+            res = usm_ndarray(
+                self.shape,
+                self.dtype,
+                buffer=copy_buffer,
+                strides=self.strides,
+                offset=self.get_offset()
+            )
+            res.flags_ = self.flags_
+            return res
+
+    def _set_namespace(self, mod):
+        """ Sets array namespace to given module `mod`. """
+        self.array_namespace_ = mod
+
+    def __array_namespace__(self, api_version=None):
+        """
+        Returns array namespace, member functions of which
+        implement data API.
+
+        Args:
+            api_version (str, optional)
+                Request namespace compliant with given version of
+                array API. If ``None``, namespace for the most
+                recent supported version is returned.
+                Default: ``None``.
+        """
+        if api_version is not None:
+            from ._array_api import __array_api_version__
+            if not isinstance(api_version, str):
+                raise TypeError(f"Expected type str, got {type(api_version)}")
+            if api_version != __array_api_version__:
+                raise ValueError(f"Only {__array_api_version__} is supported")
+        return (
+            self.array_namespace_
+            if self.array_namespace_ is not None
+            # TODO: revert to `else dpctl.tensor`
+            # when dpnp fully migrates dpctl/tensor
+            else dpctl_ext.tensor
+        )
+
+    def __bool__(self):
+        if self.size == 1:
+            _check_0d_scalar_conversion(self)
+            view = _as_zero_dim_ndarray(self)
+            return view.__bool__()
+
+        if self.size == 0:
+            raise ValueError(
+                "The truth value of an empty array is ambiguous"
+            )
+
+        raise ValueError(
+            "The truth value of an array with more than one element is "
+            "ambiguous. Use dpctl.tensor.any() or dpctl.tensor.all()"
+        )
+
+    def __float__(self):
+        if self.size == 1:
+            _check_0d_scalar_conversion(self)
+            view = _as_zero_dim_ndarray(self)
+            return view.__float__()
+
+        raise ValueError(
+            "only size-1 arrays can be converted to Python scalars"
+        )
+
+    def __complex__(self):
+        if self.size == 1:
+            _check_0d_scalar_conversion(self)
+            view = _as_zero_dim_ndarray(self)
+            return view.__complex__()
+
+        raise ValueError(
+            "only size-1 arrays can be converted to Python scalars"
+        )
+
+    def __int__(self):
+        if self.size == 1:
+            _check_0d_scalar_conversion(self)
+            view = _as_zero_dim_ndarray(self)
+            return view.__int__()
+
+        raise ValueError(
+            "only size-1 arrays can be converted to Python scalars"
+        )
+
+    def __index__(self):
+        if np.issubdtype(self.dtype, np.integer):
+            return int(self)
+
+        raise IndexError("only integer arrays are valid indices")
+
+    def __abs__(self):
+        # TODO: revert to `return dpctl.tensor...`
+        # when dpnp fully migrates dpctl/tensor
+        return dpctl_ext.tensor.abs(self)
+
+    def __add__(self, other):
+        """
+        Implementation for operator.add
+        """
+        return dpctl_ext.tensor.add(self, other)
+
+    def __and__(self, other):
+        "Implementation for operator.and"
+        return dpctl_ext.tensor.bitwise_and(self, other)
+
+    def __dlpack__(
+        self, *, stream=None, max_version=None, dl_device=None, copy=None
+    ):
+        """
+        Produces DLPack capsule.
+
+        Args:
+            stream (:class:`dpctl.SyclQueue`, optional):
+                Execution queue to synchronize with.
+                If ``None``, synchronization is not performed.
+                Default: ``None``.
+            max_version (tuple[int, int], optional):
+                The maximum DLPack version the consumer (caller of
+                ``__dlpack__``) supports. As ``__dlpack__`` may not
+                always return a DLPack capsule with version
+                `max_version`, the consumer must verify the version
+                even if this argument is passed.
+                Default: ``None``.
+            dl_device (tuple[enum.Enum, int], optional):
+                The device the returned DLPack capsule will be
+                placed on.
+                The device must be a 2-tuple matching the format of
+                ``__dlpack_device__`` method, an integer enumerator
+                representing the device type followed by an integer
+                representing the index of the device.
+                Default: ``None``.
+            copy (bool, optional):
+                Boolean indicating whether or not to copy the input.
+
+                * If ``copy`` is ``True``, the input will always be
+                  copied.
+                * If ``False``, a ``BufferError`` will be raised if a
+                  copy is deemed necessary.
+                * If ``None``, a copy will be made only if deemed
+                  necessary, otherwise, the existing memory buffer will
+                  be reused.
+
+                Default: ``None``.
+
+        Raises:
+            MemoryError:
+                when host memory can not be allocated.
+            DLPackCreationError:
+                when array is allocated on a partitioned
+                SYCL device, or with a non-default context.
+            BufferError:
+                when a copy is deemed necessary but ``copy``
+                is ``False`` or when the provided ``dl_device``
+                cannot be handled.
+        """
+        if max_version is None:
+            # legacy path for DLManagedTensor
+            # copy kwarg ignored because copy flag can't be set
+            _caps = c_dlpack.to_dlpack_capsule(self)
+            _validate_and_use_stream(stream, self.sycl_queue)
+            return _caps
+        else:
+            if not isinstance(max_version, tuple) or len(max_version) != 2:
+                raise TypeError(
+                    "`__dlpack__` expects `max_version` to be a "
+                    "2-tuple of integers `(major, minor)`, instead "
+                    f"got {max_version}"
+                )
+            dpctl_dlpack_version = get_build_dlpack_version()
+            if max_version[0] >= dpctl_dlpack_version[0]:
+                # DLManagedTensorVersioned path
+                if dl_device is not None:
+                    if not isinstance(dl_device, tuple) or len(dl_device) != 2:
+                        raise TypeError(
+                            "`__dlpack__` expects `dl_device` to be a 2-tuple "
+                            "of `(device_type, device_id)`, instead "
+                            f"got {dl_device}"
+                        )
+                    if dl_device != self.__dlpack_device__():
+                        if copy is False:
+                            raise BufferError(
+                                "array cannot be placed on the requested "
+                                "device without a copy"
+                            )
+                        if _is_host_cpu(dl_device):
+                            if stream is not None:
+                                raise ValueError(
+                                    "`stream` must be `None` when `dl_device` "
+                                    "is of type `kDLCPU`"
+                                )
+                            from ._copy_utils import _copy_to_numpy
+                            _arr = _copy_to_numpy(self)
+                            _arr.flags["W"] = self.flags["W"]
+                            return c_dlpack.numpy_to_dlpack_versioned_capsule(
+                                _arr, True
+                            )
+                        else:
+                            raise BufferError(
+                                f"targeting `dl_device` {dl_device} with "
+                                "`__dlpack__` is not yet implemented"
+                            )
+                if copy is None:
+                    copy = False
+                # TODO: strategy for handling stream on different device
+                # from dl_device
+                if copy:
+                    _validate_and_use_stream(stream, self.sycl_queue)
+                    nbytes = self.usm_data.nbytes
+                    copy_buffer = type(self.usm_data)(
+                        nbytes, queue=self.sycl_queue
+                    )
+                    copy_buffer.copy_from_device(self.usm_data)
+                    _copied_arr = usm_ndarray(
+                        self.shape,
+                        self.dtype,
+                        buffer=copy_buffer,
+                        strides=self.strides,
+                        offset=self.get_offset()
+                    )
+                    _copied_arr.flags_ = self.flags_
+                    _caps = c_dlpack.to_dlpack_versioned_capsule(
+                        _copied_arr, copy
+                    )
+                else:
+                    _caps = c_dlpack.to_dlpack_versioned_capsule(self, copy)
+                    _validate_and_use_stream(stream, self.sycl_queue)
+                return _caps
+            else:
+                # legacy path for DLManagedTensor
+                _caps = c_dlpack.to_dlpack_capsule(self)
+                _validate_and_use_stream(stream, self.sycl_queue)
+                return _caps
+
+    def __dlpack_device__(self):
+        """
+        Gives a tuple (``device_type``, ``device_id``) corresponding to
+        ``DLDevice`` entry in ``DLTensor`` in DLPack protocol.
+
+        The tuple describes the non-partitioned device where the array has been
+        allocated, or the non-partitioned parent device of the allocation
+        device.
+
+        See :class:`dpctl.tensor.DLDeviceType` for a list of devices supported
+        by the DLPack protocol.
+
+        Raises:
+            DLPackCreationError:
+                when the ``device_id`` could not be determined.
+        """
+        try:
+            dev_id = self.sycl_device.get_device_id()
+        except ValueError as e:
+            raise c_dlpack.DLPackCreationError(
+                "Could not determine id of the device where array was "
+                "allocated."
+            )
+        return (
+            DLDeviceType.kDLOneAPI,
+            dev_id,
+        )
+
+    def __eq__(self, other):
+        # TODO: revert to `return dpctl.tensor...`
+        # when dpnp fully migrates dpctl/tensor
+        return dpctl_ext.tensor.equal(self, other)
+
+    def __floordiv__(self, other):
+        return dpctl_ext.tensor.floor_divide(self, other)
+
+    def __ge__(self, other):
+        return dpctl_ext.tensor.greater_equal(self, other)
+
+    def __gt__(self, other):
+        return dpctl_ext.tensor.greater(self, other)
+
+    def __invert__(self):
+        return dpctl_ext.tensor.bitwise_invert(self)
+
+    def __le__(self, other):
+        return dpctl_ext.tensor.less_equal(self, other)
+
+    def __len__(self):
+        if (self.nd_):
+            return self.shape[0]
+        else:
+            raise TypeError("len() of unsized object")
+
+    def __lshift__(self, other):
+        return dpctl_ext.tensor.bitwise_left_shift(self, other)
+
+    def __lt__(self, other):
+        return dpctl_ext.tensor.less(self, other)
+
+    def __matmul__(self, other):
+        return dpctl_ext.tensor.matmul(self, other)
+
+    def __mod__(self, other):
+        return dpctl_ext.tensor.remainder(self, other)
+
+    def __mul__(self, other):
+        return dpctl_ext.tensor.multiply(self, other)
+
+    def __ne__(self, other):
+        return dpctl_ext.tensor.not_equal(self, other)
+
+    def __neg__(self):
+        return dpctl_ext.tensor.negative(self)
+
+    def __or__(self, other):
+        return dpctl_ext.tensor.bitwise_or(self, other)
+
+    def __pos__(self):
+        return dpctl_ext.tensor.positive(self)
+
+    def __pow__(self, other):
+        return dpctl_ext.tensor.pow(self, other)
+
+    def __rshift__(self, other):
+        return dpctl_ext.tensor.bitwise_right_shift(self, other)
+
+    def __setitem__(self, key, rhs):
+        cdef tuple _meta
+        cdef usm_ndarray Xv
+
+        if (self.flags_ & USM_ARRAY_WRITABLE) == 0:
+            raise ValueError("Can not modify read-only array.")
+
+        _meta = _basic_slice_meta(
+            key, (<object>self).shape, (<object> self).strides,
+            self.get_offset()
+        )
+
+        if len(_meta) < 5:
+            raise RuntimeError
+
+        Xv = usm_ndarray.__new__(
+            usm_ndarray,
+            _meta[0],
+            dtype=_make_typestr(self.typenum_),
+            strides=_meta[1],
+            buffer=self.base_,
+            offset=_meta[2],
+        )
+        # set namespace
+        Xv.array_namespace_ = self.array_namespace_
+
+        from ._copy_utils import (
+            _copy_from_numpy_into,
+            _copy_from_usm_ndarray_to_usm_ndarray,
+            _nonzero_impl,
+            _place_impl,
+            _put_multi_index,
+        )
+
+        adv_ind = _meta[3]
+        adv_ind_start_p = _meta[4]
+
+        if adv_ind_start_p < 0:
+            # basic slicing
+            if isinstance(rhs, usm_ndarray):
+                _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs)
+            else:
+                if hasattr(rhs, "__sycl_usm_array_interface__"):
+                    from dpctl_ext.tensor import asarray
+                    try:
+                        rhs_ar = asarray(rhs)
+                        _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs_ar)
+                    except Exception:
+                        raise ValueError(
+                            f"Input of type {type(rhs)} could not be "
+                            "converted to usm_ndarray"
+                        )
+                else:
+                    rhs_np = np.asarray(rhs)
+                    if type_bytesize(rhs_np.dtype.num) < 0:
+                        raise ValueError(
+                            f"Input of type {type(rhs)} can not be "
+                            "assigned to usm_ndarray because of "
+                            f"unsupported data type '{rhs_np.dtype}'"
+                        )
+                    try:
+                        _copy_from_numpy_into(Xv, rhs_np)
+                    except Exception:
+                        raise ValueError(
+                            f"Input of type {type(rhs)} could not be "
+                            "copied into dpctl.tensor.usm_ndarray"
+                        )
+            return
+
+        if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool:
+            _place_impl(Xv, adv_ind[0], rhs, axis=adv_ind_start_p)
+            return
+
+        if any(
+            (
+                isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool
+            ) for ind in adv_ind
+        ):
+            adv_ind_int = list()
+            for ind in adv_ind:
+                if isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool:
+                    adv_ind_int.extend(_nonzero_impl(ind))
+                else:
+                    adv_ind_int.append(ind)
+            _put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs)
+            return
+
+        _put_multi_index(Xv, adv_ind, adv_ind_start_p, rhs)
+        return
+
+    def __sub__(self, other):
+        # TODO: revert to `return dpctl.tensor...`
+        # when dpnp fully migrates dpctl/tensor
+        return dpctl_ext.tensor.subtract(self, other)
+
+    def __truediv__(self, other):
+        return dpctl_ext.tensor.divide(self, other)
+
+    def __xor__(self, other):
+        return dpctl_ext.tensor.bitwise_xor(self, other)
+
+    def __radd__(self, other):
+        return dpctl_ext.tensor.add(other, self)
+
+    def __rand__(self, other):
+        return dpctl_ext.tensor.bitwise_and(other, self)
+
+    def __rfloordiv__(self, other):
+        return dpctl_ext.tensor.floor_divide(other, self)
+
+    def __rlshift__(self, other):
+        return dpctl_ext.tensor.bitwise_left_shift(other, self)
+
+    def __rmatmul__(self, other):
+        return dpctl_ext.tensor.matmul(other, self)
+
+    def __rmod__(self, other):
+        return dpctl_ext.tensor.remainder(other, self)
+
+    def __rmul__(self, other):
+        return dpctl_ext.tensor.multiply(other, self)
+
+    def __ror__(self, other):
+        return dpctl_ext.tensor.bitwise_or(other, self)
+
+    def __rpow__(self, other):
+        return dpctl_ext.tensor.pow(other, self)
+
+    def __rrshift__(self, other):
+        return dpctl_ext.tensor.bitwise_right_shift(other, self)
+
+    def __rsub__(self, other):
+        return dpctl_ext.tensor.subtract(other, self)
+
+    def __rtruediv__(self, other):
+        return dpctl_ext.tensor.divide(other, self)
+
+    def __rxor__(self, other):
+        return dpctl_ext.tensor.bitwise_xor(other, self)
+
+    def __iadd__(self, other):
+        return dpctl_ext.tensor.add._inplace_op(self, other)
+
+    def __iand__(self, other):
+        return dpctl_ext.tensor.bitwise_and._inplace_op(self, other)
+
+    def __ifloordiv__(self, other):
+        return dpctl_ext.tensor.floor_divide._inplace_op(self, other)
+
+    def __ilshift__(self, other):
+        return dpctl_ext.tensor.bitwise_left_shift._inplace_op(self, other)
+
+    def __imatmul__(self, other):
+        return dpctl_ext.tensor.matmul(self, other, out=self, dtype=self.dtype)
+
+    def __imod__(self, other):
+        return dpctl_ext.tensor.remainder._inplace_op(self, other)
+
+    def __imul__(self, other):
+        return dpctl_ext.tensor.multiply._inplace_op(self, other)
+
+    def __ior__(self, other):
+        return dpctl_ext.tensor.bitwise_or._inplace_op(self, other)
+
+    def __ipow__(self, other):
+        return dpctl_ext.tensor.pow._inplace_op(self, other)
+
+    def __irshift__(self, other):
+        return dpctl_ext.tensor.bitwise_right_shift._inplace_op(self, other)
+
+    def __isub__(self, other):
+        return dpctl_ext.tensor.subtract._inplace_op(self, other)
+
+    def __itruediv__(self, other):
+        return dpctl_ext.tensor.divide._inplace_op(self, other)
+
+    def __ixor__(self, other):
+        return dpctl_ext.tensor.bitwise_xor._inplace_op(self, other)
+
+    def __str__(self):
+        return usm_ndarray_str(self)
+
+    def __repr__(self):
+        return usm_ndarray_repr(self)
+
+    def __array__(self, dtype=None, /, *, copy=None):
+        """NumPy's array protocol method to disallow implicit conversion.
+
+        Without this definition, `numpy.asarray(usm_ar)` converts
+        usm_ndarray instance into NumPy array with data type `object`
+        and every element being 0d usm_ndarray.
+
+        https://github.com/IntelPython/dpctl/pull/1384#issuecomment-1707212972
+        """
+        raise TypeError(
+            "Implicit conversion to a NumPy array is not allowed. "
+            "Use `dpctl.tensor.asnumpy` to copy data from this "
+            "`dpctl.tensor.usm_ndarray` instance to NumPy array"
+        )
+
+
+cdef usm_ndarray _real_view(usm_ndarray ary):
+    """
+    View into real parts of a complex type array
+    """
+    cdef int r_typenum_ = -1
+    cdef usm_ndarray r = None
+    cdef Py_ssize_t offset_elems = 0
+
+    if (ary.typenum_ == UAR_CFLOAT):
+        r_typenum_ = UAR_FLOAT
+    elif (ary.typenum_ == UAR_CDOUBLE):
+        r_typenum_ = UAR_DOUBLE
+    else:
+        raise InternalUSMArrayError(
+            "_real_view call on array of non-complex type.")
+
+    offset_elems = ary.get_offset() * 2
+    r = usm_ndarray.__new__(
+        usm_ndarray,
+        _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(),
+        dtype=_make_typestr(r_typenum_),
+        strides=tuple(2 * si for si in ary.strides),
+        buffer=ary.base_,
+        offset=offset_elems,
+        order=("C" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "F")
+    )
+    r.flags_ = _copy_writable(r.flags_, ary.flags_)
+    r.array_namespace_ = ary.array_namespace_
+    return r
+
+
+cdef usm_ndarray _imag_view(usm_ndarray ary):
+    """
+    View into imaginary parts of a complex type array
+    """
+    cdef int r_typenum_ = -1
+    cdef usm_ndarray r = None
+    cdef Py_ssize_t offset_elems = 0
+
+    if (ary.typenum_ == UAR_CFLOAT):
+        r_typenum_ = UAR_FLOAT
+    elif (ary.typenum_ == UAR_CDOUBLE):
+        r_typenum_ = UAR_DOUBLE
+    else:
+        raise InternalUSMArrayError(
+            "_imag_view call on array of non-complex type.")
+
+    # displace pointer to imaginary part
+    offset_elems = 2 * ary.get_offset() + 1
+    r = usm_ndarray.__new__(
+        usm_ndarray,
+        _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(),
+        dtype=_make_typestr(r_typenum_),
+        strides=tuple(2 * si for si in ary.strides),
+        buffer=ary.base_,
+        offset=offset_elems,
+        order=("C" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "F")
+    )
+    r.flags_ = _copy_writable(r.flags_, ary.flags_)
+    r.array_namespace_ = ary.array_namespace_
+    return r
+
+
+cdef usm_ndarray _transpose(usm_ndarray ary):
+    """
+    Construct transposed array without copying the data
+    """
+    cdef usm_ndarray r = usm_ndarray.__new__(
+        usm_ndarray,
+        _make_reversed_int_tuple(ary.nd_, ary.shape_),
+        dtype=_make_typestr(ary.typenum_),
+        strides=(
+            _make_reversed_int_tuple(ary.nd_, ary.strides_)
+            if (ary.strides_) else None),
+        buffer=ary.base_,
+        order=("F" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "C"),
+        offset=ary.get_offset()
+    )
+    r.flags_ = _copy_writable(r.flags_, ary.flags_)
+    return r
+
+
+cdef usm_ndarray _m_transpose(usm_ndarray ary):
+    """
+    Construct matrix transposed array
+    """
+    cdef usm_ndarray r = usm_ndarray.__new__(
+        usm_ndarray,
+        _swap_last_two(_make_int_tuple(ary.nd_, ary.shape_)),
+        dtype=_make_typestr(ary.typenum_),
+        strides=_swap_last_two(ary.strides),
+        buffer=ary.base_,
+        order=("F" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "C"),
+        offset=ary.get_offset()
+    )
+    r.flags_ = _copy_writable(r.flags_, ary.flags_)
+    return r
+
+
+cdef usm_ndarray _zero_like(usm_ndarray ary):
+    """
+    Make C-contiguous array of zero elements with same shape,
+    type, device, and sycl_queue as ary.
+    """
+    cdef dt = _make_typestr(ary.typenum_)
+    cdef usm_ndarray r = usm_ndarray(
+        _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(),
+        dtype=dt,
+        buffer=ary.base_.get_usm_type(),
+        buffer_ctor_kwargs={"queue": ary.get_sycl_queue()},
+    )
+    r.base_.memset()
+    return r
+
+
+cdef api char* UsmNDArray_GetData(usm_ndarray arr):
+    """Get allocation pointer of zero index element of array """
+    return arr.get_data()
+
+
+cdef api int UsmNDArray_GetNDim(usm_ndarray arr):
+    """Get array rank: length of its shape"""
+    return arr.get_ndim()
+
+
+cdef api Py_ssize_t* UsmNDArray_GetShape(usm_ndarray arr):
+    """Get host pointer to shape vector"""
+    return arr.get_shape()
+
+
+cdef api Py_ssize_t* UsmNDArray_GetStrides(usm_ndarray arr):
+    """Get host pointer to strides vector"""
+    return arr.get_strides()
+
+
+cdef api int UsmNDArray_GetTypenum(usm_ndarray arr):
+    """Get type number for data type of array elements"""
+    return arr.get_typenum()
+
+
+cdef api int UsmNDArray_GetElementSize(usm_ndarray arr):
+    """Get array element size in bytes"""
+    return arr.get_itemsize()
+
+
+cdef api int UsmNDArray_GetFlags(usm_ndarray arr):
+    """Get flags of array"""
+    return arr.get_flags()
+
+
+cdef api c_dpctl.DPCTLSyclQueueRef UsmNDArray_GetQueueRef(usm_ndarray arr):
+    """Get DPCTLSyclQueueRef for queue associated with the array"""
+    return arr.get_queue_ref()
+
+
+cdef api Py_ssize_t UsmNDArray_GetOffset(usm_ndarray arr):
+    """Get offset of zero-index array element from the beginning of the USM
+    allocation"""
+    return arr.get_offset()
+
+
+cdef api object UsmNDArray_GetUSMData(usm_ndarray arr):
+    """Get USM data object underlying the array"""
+    return arr.get_base()
+
+
+cdef api void UsmNDArray_SetWritableFlag(usm_ndarray arr, int flag):
+    """Set/unset USM_ARRAY_WRITABLE in the given array `arr`."""
+    arr._set_writable_flag(flag)
+
+
+cdef api object UsmNDArray_MakeSimpleFromMemory(
+    int nd, const Py_ssize_t *shape, int typenum,
+    c_dpmem._Memory mobj, Py_ssize_t offset, char order
+):
+    """Create contiguous usm_ndarray.
+
+    Args:
+        nd: number of dimensions (non-negative)
+        shape: array of nd non-negative array's sizes along each dimension
+        typenum: array elemental type number
+        ptr: pointer to the start of allocation
+        QRef: DPCTLSyclQueueRef associated with the allocation
+        offset: distance between element with zero multi-index and the
+                start of allocation
+        order: Memory layout of the array. Use 'C' for C-contiguous or
+               row-major layout; 'F' for F-contiguous or column-major layout
+    Returns:
+        Created usm_ndarray instance
+    """
+    cdef object shape_tuple = _make_int_tuple(nd, <Py_ssize_t *>shape)
+    cdef usm_ndarray arr = usm_ndarray(
+        shape_tuple,
+        dtype=_make_typestr(typenum),
+        buffer=mobj,
+        offset=offset,
+        order=<bytes>(order)
+    )
+    return arr
+
+
+cdef api object UsmNDArray_MakeSimpleFromPtr(
+    size_t nelems,
+    int typenum,
+    c_dpctl.DPCTLSyclUSMRef ptr,
+    c_dpctl.DPCTLSyclQueueRef QRef,
+    object owner
+):
+    """Create 1D contiguous usm_ndarray from pointer.
+
+    Args:
+        nelems: number of elements in array
+        typenum: array elemental type number
+        ptr: pointer to the start of allocation
+        QRef: DPCTLSyclQueueRef associated with the allocation
+        owner: Python object managing lifetime of USM allocation.
+               Value None implies transfer of USM allocation ownership
+               to the created array object.
+    Returns:
+        Created usm_ndarray instance
+    """
+    cdef int itemsize = type_bytesize(typenum)
+    if (itemsize < 1):
+        raise ValueError(
+            "dtype with typenum=" + str(typenum) + " is not supported."
+        )
+    cdef size_t nbytes = (<size_t> itemsize) * nelems
+    cdef c_dpmem._Memory mobj
+    mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+        ptr, nbytes, QRef, memory_owner=owner
+    )
+    cdef usm_ndarray arr = usm_ndarray(
+        (nelems,),
+        dtype=_make_typestr(typenum),
+        buffer=mobj
+    )
+    return arr
+
+cdef api object UsmNDArray_MakeFromPtr(
+    int nd,
+    const Py_ssize_t *shape,
+    int typenum,
+    const Py_ssize_t *strides,
+    c_dpctl.DPCTLSyclUSMRef ptr,
+    c_dpctl.DPCTLSyclQueueRef QRef,
+    Py_ssize_t offset,
+    object owner
+):
+    """
+    General usm_ndarray constructor from externally made USM-allocation.
+
+    Args:
+        nd: number of dimensions (non-negative)
+        shape: array of nd non-negative array's sizes along each dimension
+        typenum: array elemental type number
+        strides: array of nd strides along each dimension in elements
+        ptr: pointer to the start of allocation
+        QRef: DPCTLSyclQueueRef associated with the allocation
+        offset: distance between element with zero multi-index and the
+                start of allocation
+        owner: Python object managing lifetime of USM allocation.
+               Value None implies transfer of USM allocation ownership
+               to the created array object.
+    Returns:
+        Created usm_ndarray instance
+    """
+    cdef int itemsize = type_bytesize(typenum)
+    cdef size_t nelems = 1
+    cdef Py_ssize_t min_disp = 0
+    cdef Py_ssize_t max_disp = 0
+    cdef Py_ssize_t step_ = 0
+    cdef Py_ssize_t dim_ = 0
+    cdef it = 0
+    cdef c_dpmem._Memory mobj
+    cdef usm_ndarray arr
+    cdef object obj_shape
+    cdef object obj_strides
+
+    if (itemsize < 1):
+        raise ValueError(
+            "dtype with typenum=" + str(typenum) + " is not supported."
+        )
+    if (nd < 0):
+        raise ValueError("Dimensionality must be non-negative")
+    if (ptr is NULL or QRef is NULL):
+        raise ValueError(
+            "Non-null USM allocation pointer and QRef are expected"
+        )
+    if (nd == 0):
+        # case of 0d scalars
+        mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+            ptr, itemsize, QRef, memory_owner=owner
+        )
+        arr = usm_ndarray(
+            tuple(),
+            dtype=_make_typestr(typenum),
+            buffer=mobj
+        )
+        return arr
+    if (shape is NULL or strides is NULL):
+        raise ValueError("Both shape and stride vectors are required")
+    for it in range(nd):
+        dim_ = shape[it]
+        if dim_ < 0:
+            raise ValueError(
+                f"Dimension along axis {it} must be non-negative"
+            )
+        nelems *= dim_
+        if dim_ > 0:
+            step_ = strides[it]
+            if step_ > 0:
+                max_disp += step_ * (dim_ - 1)
+            else:
+                min_disp += step_ * (dim_ - 1)
+
+    obj_shape = _make_int_tuple(nd, shape)
+    obj_strides = _make_int_tuple(nd, strides)
+    if nelems == 0:
+        mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+            ptr, itemsize, QRef, memory_owner=owner
+        )
+        arr = usm_ndarray(
+            obj_shape,
+            dtype=_make_typestr(typenum),
+            strides=obj_strides,
+            buffer=mobj,
+            offset=0
+        )
+        return arr
+    if offset + min_disp < 0:
+        raise ValueError(
+            "Given shape, strides and offset reference out-of-bound memory"
+        )
+    nbytes = (<size_t> itemsize) * (offset + max_disp + 1)
+    mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+        ptr, nbytes, QRef, memory_owner=owner
+    )
+    arr = usm_ndarray(
+        obj_shape,
+        dtype=_make_typestr(typenum),
+        strides=obj_strides,
+        buffer=mobj,
+        offset=offset
+    )
+    return arr
+
+
+def _is_object_with_buffer_protocol(o):
+    "Returns True if object supports Python buffer protocol"
+    return _is_buffer(o)
diff --git a/dpctl_ext/tensor/_utility_functions.py b/dpctl_ext/tensor/_utility_functions.py
index 821f0954017a..c892d777102d 100644
--- a/dpctl_ext/tensor/_utility_functions.py
+++ b/dpctl_ext/tensor/_utility_functions.py
@@ -29,12 +29,11 @@
 import builtins
 import operator
 
-import dpctl.tensor as dpt
 import dpctl.utils as du
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 import dpctl_ext.tensor._tensor_reductions_impl as tri
 
@@ -60,7 +59,7 @@ def _boolean_reduction(x, axis, keepdims, func):
         red_nd = nd
         # case of a scalar
         if red_nd == 0:
-            return dpt_ext.astype(x, dpt.bool)
+            return dpt.astype(x, dpt.bool)
         x_tmp = x
         res_shape = ()
         perm = list(range(nd))
@@ -72,9 +71,9 @@ def _boolean_reduction(x, axis, keepdims, func):
         red_nd = len(axis)
         # check for axis=()
         if red_nd == 0:
-            return dpt_ext.astype(x, dpt.bool)
+            return dpt.astype(x, dpt.bool)
         perm = [i for i in range(nd) if i not in axis] + list(axis)
-        x_tmp = dpt_ext.permute_dims(x, perm)
+        x_tmp = dpt.permute_dims(x, perm)
         res_shape = x_tmp.shape[: nd - red_nd]
 
     exec_q = x.sycl_queue
@@ -85,7 +84,7 @@ def _boolean_reduction(x, axis, keepdims, func):
     # always allocate the temporary as
     # int32 and usm-device  to ensure that atomic updates
     # are supported
-    res_tmp = dpt_ext.empty(
+    res_tmp = dpt.empty(
         res_shape,
         dtype=dpt.int32,
         usm_type="device",
@@ -101,7 +100,7 @@ def _boolean_reduction(x, axis, keepdims, func):
     _manager.add_event_pair(hev0, ev0)
 
     # copy to boolean result array
-    res = dpt_ext.empty(
+    res = dpt.empty(
         res_shape,
         dtype=dpt.bool,
         usm_type=res_usm_type,
@@ -115,7 +114,7 @@ def _boolean_reduction(x, axis, keepdims, func):
     if keepdims:
         res_shape = res_shape + (1,) * red_nd
         inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        res = dpt_ext.permute_dims(dpt_ext.reshape(res, res_shape), inv_perm)
+        res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
     return res
 
 
@@ -292,7 +291,7 @@ def _concat_diff_input(arr, axis, prepend, append):
         if isinstance(prepend, dpt.usm_ndarray):
             a_prepend = prepend
         else:
-            a_prepend = dpt_ext.asarray(
+            a_prepend = dpt.asarray(
                 prepend,
                 dtype=prepend_dtype,
                 usm_type=coerced_usm_type,
@@ -301,7 +300,7 @@ def _concat_diff_input(arr, axis, prepend, append):
         if isinstance(append, dpt.usm_ndarray):
             a_append = append
         else:
-            a_append = dpt_ext.asarray(
+            a_append = dpt.asarray(
                 append,
                 dtype=append_dtype,
                 usm_type=coerced_usm_type,
@@ -309,11 +308,11 @@ def _concat_diff_input(arr, axis, prepend, append):
             )
         if not prepend_shape:
             prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
-            a_prepend = dpt_ext.broadcast_to(a_prepend, prepend_shape)
+            a_prepend = dpt.broadcast_to(a_prepend, prepend_shape)
         if not append_shape:
             append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
-            a_append = dpt_ext.broadcast_to(a_append, append_shape)
-        return dpt_ext.concat((a_prepend, arr, a_append), axis=axis)
+            a_append = dpt.broadcast_to(a_append, append_shape)
+        return dpt.concat((a_prepend, arr, a_append), axis=axis)
     elif prepend is not None:
         q1, x_usm_type = arr.sycl_queue, arr.usm_type
         q2, prepend_usm_type = _get_queue_usm_type(prepend)
@@ -361,7 +360,7 @@ def _concat_diff_input(arr, axis, prepend, append):
         if isinstance(prepend, dpt.usm_ndarray):
             a_prepend = prepend
         else:
-            a_prepend = dpt_ext.asarray(
+            a_prepend = dpt.asarray(
                 prepend,
                 dtype=prepend_dtype,
                 usm_type=coerced_usm_type,
@@ -369,8 +368,8 @@ def _concat_diff_input(arr, axis, prepend, append):
             )
         if not prepend_shape:
             prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
-            a_prepend = dpt_ext.broadcast_to(a_prepend, prepend_shape)
-        return dpt_ext.concat((a_prepend, arr), axis=axis)
+            a_prepend = dpt.broadcast_to(a_prepend, prepend_shape)
+        return dpt.concat((a_prepend, arr), axis=axis)
     elif append is not None:
         q1, x_usm_type = arr.sycl_queue, arr.usm_type
         q2, append_usm_type = _get_queue_usm_type(append)
@@ -416,7 +415,7 @@ def _concat_diff_input(arr, axis, prepend, append):
         if isinstance(append, dpt.usm_ndarray):
             a_append = append
         else:
-            a_append = dpt_ext.asarray(
+            a_append = dpt.asarray(
                 append,
                 dtype=append_dtype,
                 usm_type=coerced_usm_type,
@@ -424,8 +423,8 @@ def _concat_diff_input(arr, axis, prepend, append):
             )
         if not append_shape:
             append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
-            a_append = dpt_ext.broadcast_to(a_append, append_shape)
-        return dpt_ext.concat((arr, a_append), axis=axis)
+            a_append = dpt.broadcast_to(a_append, append_shape)
+        return dpt.concat((arr, a_append), axis=axis)
     else:
         arr1 = arr
     return arr1
@@ -489,7 +488,7 @@ def diff(x, /, *, axis=-1, n=1, prepend=None, append=None):
         slice(None) if i != axis else slice(None, -1) for i in range(x_nd)
     )
 
-    diff_op = dpt_ext.not_equal if x.dtype == dpt.bool else dpt_ext.subtract
+    diff_op = dpt.not_equal if x.dtype == dpt.bool else dpt.subtract
     if n > 1:
         arr_tmp0 = diff_op(arr[sl0], arr[sl1])
         arr_tmp1 = diff_op(arr_tmp0[sl0], arr_tmp0[sl1])
diff --git a/dpctl_ext/tensor/include/dlpack/LICENSE.third-party b/dpctl_ext/tensor/include/dlpack/LICENSE.third-party
new file mode 100644
index 000000000000..20a9c8a7b4dc
--- /dev/null
+++ b/dpctl_ext/tensor/include/dlpack/LICENSE.third-party
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2017 by Contributors
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/dpctl_ext/tensor/include/dlpack/README.md b/dpctl_ext/tensor/include/dlpack/README.md
new file mode 100644
index 000000000000..315ad1b9a566
--- /dev/null
+++ b/dpctl_ext/tensor/include/dlpack/README.md
@@ -0,0 +1,7 @@
+# DLPack header
+
+The header `dlpack.h` downloaded from `https://github.com/dmlc/dlpack.git` remote at tag v1.3 commit [`84d107b`](https://github.com/dmlc/dlpack/commit/84d107bf416c6bab9ae68ad285876600d230490d).
+
+The file can also be viewed using github web interface at https://github.com/dmlc/dlpack/blob/v1.3/include/dlpack/dlpack.h
+
+License file was retrieved from https://github.com/dmlc/dlpack/blob/main/LICENSE
diff --git a/dpctl_ext/tensor/include/dlpack/dlpack.h b/dpctl_ext/tensor/include/dlpack/dlpack.h
new file mode 100644
index 000000000000..5196acc87711
--- /dev/null
+++ b/dpctl_ext/tensor/include/dlpack/dlpack.h
@@ -0,0 +1,683 @@
+/*!
+ *  Copyright (c) 2017 -  by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ */
+#ifndef DLPACK_DLPACK_H_
+#define DLPACK_DLPACK_H_
+
+/**
+ * \brief Compatibility with C++
+ */
+#ifdef __cplusplus
+#define DLPACK_EXTERN_C extern "C"
+#else
+#define DLPACK_EXTERN_C
+#endif
+
+/*! \brief The current major version of dlpack */
+#define DLPACK_MAJOR_VERSION 1
+
+/*! \brief The current minor version of dlpack */
+#define DLPACK_MINOR_VERSION 3
+
+/*! \brief DLPACK_DLL prefix for windows */
+#ifdef _WIN32
+#ifdef DLPACK_EXPORTS
+#define DLPACK_DLL __declspec(dllexport)
+#else
+#define DLPACK_DLL __declspec(dllimport)
+#endif
+#else
+#define DLPACK_DLL
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+    /*!
+     * \brief The DLPack version.
+     *
+     * A change in major version indicates that we have changed the
+     * data layout of the ABI - DLManagedTensorVersioned.
+     *
+     * A change in minor version indicates that we have added new
+     * code, such as a new device type, but the ABI is kept the same.
+     *
+     * If an obtained DLPack tensor has a major version that disagrees
+     * with the version number specified in this header file
+     * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter
+     * (and it is safe to do so). It is not safe to access any other fields
+     * as the memory layout will have changed.
+     *
+     * In the case of a minor version mismatch, the tensor can be safely used as
+     * long as the consumer knows how to interpret all fields. Minor version
+     * updates indicate the addition of enumeration values.
+     */
+    typedef struct
+    {
+        /*! \brief DLPack major version. */
+        uint32_t major;
+        /*! \brief DLPack minor version. */
+        uint32_t minor;
+    } DLPackVersion;
+
+/*!
+ * \brief The device type in DLDevice.
+ */
+#ifdef __cplusplus
+    typedef enum : int32_t
+    {
+#else
+typedef enum
+{
+#endif
+        /*! \brief CPU device */
+        kDLCPU = 1,
+        /*! \brief CUDA GPU device */
+        kDLCUDA = 2,
+        /*!
+         * \brief Pinned CUDA CPU memory by cudaMallocHost
+         */
+        kDLCUDAHost = 3,
+        /*! \brief OpenCL devices. */
+        kDLOpenCL = 4,
+        /*! \brief Vulkan buffer for next generation graphics. */
+        kDLVulkan = 7,
+        /*! \brief Metal for Apple GPU. */
+        kDLMetal = 8,
+        /*! \brief Verilog simulator buffer */
+        kDLVPI = 9,
+        /*! \brief ROCm GPUs for AMD GPUs */
+        kDLROCM = 10,
+        /*!
+         * \brief Pinned ROCm CPU memory allocated by hipMallocHost
+         */
+        kDLROCMHost = 11,
+        /*!
+         * \brief Reserved extension device type,
+         * used for quickly test extension device
+         * The semantics can differ depending on the implementation.
+         */
+        kDLExtDev = 12,
+        /*!
+         * \brief CUDA managed/unified memory allocated by cudaMallocManaged
+         */
+        kDLCUDAManaged = 13,
+        /*!
+         * \brief Unified shared memory allocated on a oneAPI non-partititioned
+         * device. Call to oneAPI runtime is required to determine the device
+         * type, the USM allocation type and the sycl context it is bound to.
+         *
+         */
+        kDLOneAPI = 14,
+        /*! \brief GPU support for next generation WebGPU standard. */
+        kDLWebGPU = 15,
+        /*! \brief Qualcomm Hexagon DSP */
+        kDLHexagon = 16,
+        /*! \brief Microsoft MAIA devices */
+        kDLMAIA = 17,
+        /*! \brief AWS Trainium */
+        kDLTrn = 18,
+    } DLDeviceType;
+
+    /*!
+     * \brief A Device for Tensor and operator.
+     */
+    typedef struct
+    {
+        /*! \brief The device type used in the device. */
+        DLDeviceType device_type;
+        /*!
+         * \brief The device index.
+         * For vanilla CPU memory, pinned memory, or managed memory, this is set
+         * to 0.
+         */
+        int32_t device_id;
+    } DLDevice;
+
+    /*!
+     * \brief The type code options DLDataType.
+     */
+    typedef enum
+    {
+        /*! \brief signed integer */
+        kDLInt = 0U,
+        /*! \brief unsigned integer */
+        kDLUInt = 1U,
+        /*! \brief IEEE floating point */
+        kDLFloat = 2U,
+        /*!
+         * \brief Opaque handle type, reserved for testing purposes.
+         * Frameworks need to agree on the handle data type for the exchange to
+         * be well-defined.
+         */
+        kDLOpaqueHandle = 3U,
+        /*! \brief bfloat16 */
+        kDLBfloat = 4U,
+        /*!
+         * \brief complex number
+         * (C/C++/Python layout: compact struct per complex number)
+         */
+        kDLComplex = 5U,
+        /*! \brief boolean */
+        kDLBool = 6U,
+        /*! \brief FP8 data types */
+        kDLFloat8_e3m4 = 7U,
+        kDLFloat8_e4m3 = 8U,
+        kDLFloat8_e4m3b11fnuz = 9U,
+        kDLFloat8_e4m3fn = 10U,
+        kDLFloat8_e4m3fnuz = 11U,
+        kDLFloat8_e5m2 = 12U,
+        kDLFloat8_e5m2fnuz = 13U,
+        kDLFloat8_e8m0fnu = 14U,
+        /*! \brief FP6 data types
+         * Setting bits != 6 is currently unspecified, and the producer must
+         * ensure it is set while the consumer must stop importing if the value
+         * is unexpected.
+         */
+        kDLFloat6_e2m3fn = 15U,
+        kDLFloat6_e3m2fn = 16U,
+        /*! \brief FP4 data types
+         * Setting bits != 4 is currently unspecified, and the producer must
+         * ensure it is set while the consumer must stop importing if the value
+         * is unexpected.
+         */
+        kDLFloat4_e2m1fn = 17U,
+    } DLDataTypeCode;
+
+    /*!
+     * \brief The data type the tensor can hold. The data type is assumed to
+     * follow the native endian-ness. An explicit error message should be raised
+     * when attempting to export an array with non-native endianness
+     *
+     *  Examples
+     *   - float: type_code = 2, bits = 32, lanes = 1
+     *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+     *   - int8: type_code = 0, bits = 8, lanes = 1
+     *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+     *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library
+     * convention, the underlying storage size of bool is 8 bits)
+     *   - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
+     *   - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
+     *   - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
+     *
+     *  When a sub-byte type is packed, DLPack requires the data to be in little
+     * bit-endian, i.e., for a packed data set D ((D >> (i * bits)) && bit_mask)
+     * stores the i-th element.
+     */
+    typedef struct
+    {
+        /*!
+         * \brief Type code of base types.
+         * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+         * footprint, but the value should be one of DLDataTypeCode enum values.
+         * */
+        uint8_t code;
+        /*!
+         * \brief Number of bits, common choices are 8, 16, 32.
+         */
+        uint8_t bits;
+        /*! \brief Number of lanes in the type, used for vector types. */
+        uint16_t lanes;
+    } DLDataType;
+
+    /*!
+     * \brief Plain C Tensor object, does not manage memory.
+     */
+    typedef struct
+    {
+        /*!
+         * \brief The data pointer points to the allocated data. This will be
+         * CUDA device pointer or cl_mem handle in OpenCL. It may be opaque on
+         * some device types. This pointer is always aligned to 256 bytes as in
+         * CUDA. The `byte_offset` field should be used to point to the
+         * beginning of the data.
+         *
+         * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch,
+         * TensorFlow, TVM, perhaps others) do not adhere to this 256 byte
+         * alignment requirement on CPU/CUDA/ROCm, and always use
+         * `byte_offset=0`.  This must be fixed (after which this note will be
+         * updated); at the moment it is recommended to not rely on the data
+         * pointer being correctly aligned.
+         *
+         * For given DLTensor, the size of memory required to store the contents
+         * of data is calculated as follows:
+         *
+         * \code{.c}
+         * static inline size_t GetDataSize(const DLTensor* t) {
+         *   size_t size = 1;
+         *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+         *     size *= t->shape[i];
+         *   }
+         *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+         *   return size;
+         * }
+         * \endcode
+         *
+         * Note that if the tensor is of size zero, then the data pointer should
+         * be set to `NULL`.
+         */
+        void *data;
+        /*! \brief The device of the tensor */
+        DLDevice device;
+        /*! \brief Number of dimensions */
+        int32_t ndim;
+        /*! \brief The data type of the pointer*/
+        DLDataType dtype;
+        /*!
+         * \brief The shape of the tensor
+         *
+         *  When ndim == 0, shape can be set to NULL.
+         */
+        int64_t *shape;
+        /*!
+         * \brief strides of the tensor (in number of elements, not bytes),
+         *  can not be NULL if ndim != 0, must points to
+         *  an array of ndim elements that specifies the strides,
+         *  so consumer can always rely on strides[dim] being valid for 0 <= dim
+         * < ndim.
+         *
+         *  When ndim == 0, strides can be set to NULL.
+         *
+         *  \note Before DLPack v1.2, strides can be NULL to indicate contiguous
+         * data. This is not allowed in DLPack v1.2 and later. The rationale is
+         * to simplify the consumer handling.
+         */
+        int64_t *strides;
+        /*! \brief The offset in bytes to the beginning pointer to data */
+        uint64_t byte_offset;
+    } DLTensor;
+
+    /*!
+     * \brief C Tensor object, manage memory of DLTensor. This data structure is
+     *  intended to facilitate the borrowing of DLTensor by another framework.
+     * It is not meant to transfer the tensor. When the borrowing framework
+     * doesn't need the tensor, it should call the deleter to notify the host
+     * that the resource is no longer needed.
+     *
+     * \note This data structure is used as Legacy DLManagedTensor
+     *       in DLPack exchange and is deprecated after DLPack v0.8
+     *       Use DLManagedTensorVersioned instead.
+     *       This data structure may get renamed or deleted in future versions.
+     *
+     * \sa DLManagedTensorVersioned
+     */
+    typedef struct DLManagedTensor
+    {
+        /*! \brief DLTensor which is being memory managed */
+        DLTensor dl_tensor;
+        /*! \brief the context of the original host framework of DLManagedTensor
+         * in which DLManagedTensor is used in the framework. It can also be
+         * NULL.
+         */
+        void *manager_ctx;
+        /*!
+         * \brief Destructor - this should be called
+         * to destruct the manager_ctx  which backs the DLManagedTensor. It can
+         * be NULL if there is no way for the caller to provide a reasonable
+         * destructor. The destructor deletes the argument self as well.
+         */
+        void (*deleter)(struct DLManagedTensor *self);
+    } DLManagedTensor;
+
+// bit masks used in the DLManagedTensorVersioned
+
+/*! \brief bit mask to indicate that the tensor is read only. */
+#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
+
+/*!
+ * \brief bit mask to indicate that the tensor is a copy made by the producer.
+ *
+ * If set, the tensor is considered solely owned throughout its lifetime by the
+ * consumer, until the producer-provided deleter is invoked.
+ */
+#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)
+
+/*!
+ * \brief bit mask to indicate that whether a sub-byte type is packed or padded.
+ *
+ * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
+ * be set by the producer to signal that a tensor of sub-byte type is padded.
+ */
+#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
+
+    /*!
+     * \brief A versioned and managed C Tensor object, manage memory of
+     * DLTensor.
+     *
+     * This data structure is intended to facilitate the borrowing of DLTensor
+     * by another framework. It is not meant to transfer the tensor. When the
+     * borrowing framework doesn't need the tensor, it should call the deleter
+     * to notify the host that the resource is no longer needed.
+     *
+     * \note This is the current standard DLPack exchange data structure.
+     */
+    typedef struct DLManagedTensorVersioned
+    {
+        /*!
+         * \brief The API and ABI version of the current managed Tensor
+         */
+        DLPackVersion version;
+        /*!
+         * \brief the context of the original host framework.
+         *
+         * Stores DLManagedTensorVersioned is used in the
+         * framework. It can also be NULL.
+         */
+        void *manager_ctx;
+        /*!
+         * \brief Destructor.
+         *
+         * This should be called to destruct manager_ctx which holds the
+         * DLManagedTensorVersioned. It can be NULL if there is no way for the
+         * caller to provide a reasonable destructor. The destructor deletes the
+         * argument self as well.
+         */
+        void (*deleter)(struct DLManagedTensorVersioned *self);
+        /*!
+         * \brief Additional bitmask flags information about the tensor.
+         *
+         * By default the flags should be set to 0.
+         *
+         * \note Future ABI changes should keep everything until this field
+         *       stable, to ensure that deleter can be correctly called.
+         *
+         * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+         * \sa DLPACK_FLAG_BITMASK_IS_COPIED
+         */
+        uint64_t flags;
+        /*! \brief DLTensor which is being memory managed */
+        DLTensor dl_tensor;
+    } DLManagedTensorVersioned;
+
+    //----------------------------------------------------------------------
+    // DLPack `__dlpack_c_exchange_api__` fast exchange protocol definitions
+    //----------------------------------------------------------------------
+    /*!
+     * \brief Request a producer library to create a new tensor.
+     *
+     * Create a new `DLManagedTensorVersioned` within the context of the
+     * producer library. The allocation is defined via the prototype DLTensor.
+     *
+     * This function is exposed by the framework through the DLPackExchangeAPI.
+     *
+     * \param prototype The prototype DLTensor. Only the dtype, ndim, shape,
+     *        and device fields are used.
+     * \param out The output DLManagedTensorVersioned.
+     * \param error_ctx Context for `SetError`.
+     * \param SetError The function to set the error.
+     * \return The owning DLManagedTensorVersioned* or NULL on failure.
+     *         SetError is called exactly when NULL is returned (the implementer
+     *         must ensure this).
+     * \note - As a C function, must not thrown C++ exceptions.
+     *       - Error propagation via SetError to avoid any direct need
+     *         of Python API. Due to this `SetError` may have to ensure the GIL
+     * is held since it will presumably set a Python error.
+     *
+     * \sa DLPackExchangeAPI
+     */
+    typedef int (*DLPackManagedTensorAllocator)( //
+        DLTensor *prototype,
+        DLManagedTensorVersioned **out,
+        void *error_ctx, //
+        void (*SetError)(void *error_ctx,
+                         const char *kind,
+                         const char *message) //
+    );
+
+    /*!
+     * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
+     *
+     * This function does not perform any stream synchronization. The consumer
+     * should query DLPackCurrentWorkStream to get the current work stream and
+     * launch kernels on it.
+     *
+     * This function is exposed by the framework through the DLPackExchangeAPI.
+     *
+     * \param py_object The Python object to convert. Must have the same type
+     *        as the one the `DLPackExchangeAPI` was discovered from.
+     * \param out The output DLManagedTensorVersioned.
+     * \return The owning DLManagedTensorVersioned* or NULL on failure with a
+     *         Python exception set. If the data cannot be described using
+     * DLPack this should be a BufferError if possible. \note - As a C function,
+     * must not thrown C++ exceptions.
+     *
+     * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+     */
+    typedef int (*DLPackManagedTensorFromPyObjectNoSync)( //
+        void *py_object,                                  //
+        DLManagedTensorVersioned **out                    //
+    );
+
+    /*!
+     * \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor.
+     *
+     * This function provides a faster interface for temporary, non-owning,
+     * exchange. The producer (implementer) still owns the memory of data,
+     * strides, shape. The liveness of the DLTensor and the data it views is
+     * only guaranteed until control is returned.
+     *
+     * This function currently assumes that the producer (implementer) can fill
+     * in the DLTensor shape and strides without the need for temporary
+     * allocations.
+     *
+     * This function does not perform any stream synchronization. The consumer
+     * should query DLPackCurrentWorkStream to get the current work stream and
+     * launch kernels on it.
+     *
+     * This function is exposed by the framework through the DLPackExchangeAPI.
+     *
+     * \param py_object The Python object to convert. Must have the same type
+     *        as the one the `DLPackExchangeAPI` was discovered from.
+     * \param out The output DLTensor, whose space is pre-allocated on stack.
+     * \return 0 on success, -1 on failure with a Python exception set.
+     * \note - As a C function, must not thrown C++ exceptions.
+     *
+     * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+     */
+    typedef int (*DLPackDLTensorFromPyObjectNoSync)( //
+        void *py_object,                             //
+        DLTensor *out                                //
+    );
+
+    /*!
+     * \brief Obtain the current work stream of a device.
+     *
+     * Obtain the current work stream of a device from the producer framework.
+     * For example, it should map to torch.cuda.current_stream in PyTorch.
+     *
+     * When device_type is kDLCPU, the consumer do not have to query the stream
+     * and the producer can simply return NULL when queried.
+     * The consumer do not have to do anything on stream sync or setting.
+     * So CPU only framework can just provide a dummy implementation that
+     * always set out_current_stream[0] to NULL.
+     *
+     * \param device_type The device type.
+     * \param device_id The device id.
+     * \param out_current_stream The output current work stream.
+     *
+     * \return 0 on success, -1 on failure with a Python exception set.
+     * \note - As a C function, must not thrown C++ exceptions.
+     *
+     * \sa DLPackExchangeAPI
+     */
+    typedef int (*DLPackCurrentWorkStream)( //
+        DLDeviceType device_type,           //
+        int32_t device_id,                  //
+        void **out_current_stream           //
+    );
+
+    /*!
+     * \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.
+     *
+     * Convert an owning DLManagedTensorVersioned* to the Python tensor of the
+     * producer (implementer) library with the correct type.
+     *
+     * This function does not perform any stream synchronization.
+     *
+     * This function is exposed by the framework through the DLPackExchangeAPI.
+     *
+     * \param tensor The DLManagedTensorVersioned to convert the ownership of
+     * the tensor is stolen. \param out_py_object The output Python object.
+     * \return 0 on success, -1 on failure with a Python exception set.
+     *
+     * \sa DLPackExchangeAPI
+     */
+    typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
+        DLManagedTensorVersioned *tensor,               //
+        void **out_py_object                            //
+    );
+
+    /*!
+     * \brief DLPackExchangeAPI stable header.
+     * \sa DLPackExchangeAPI
+     */
+    typedef struct DLPackExchangeAPIHeader
+    {
+        /*!
+         * \brief The provided DLPack version the consumer must check major
+         * version compatibility before using this struct.
+         */
+        DLPackVersion version;
+        /*!
+         * \brief Optional pointer to an older DLPackExchangeAPI in the chain.
+         *
+         * It must be NULL if the framework does not support older versions.
+         * If the current major version is larger than the one supported by the
+         * consumer, the consumer may walk this to find an earlier supported
+         * version.
+         *
+         * \sa DLPackExchangeAPI
+         */
+        struct DLPackExchangeAPIHeader *prev_api;
+    } DLPackExchangeAPIHeader;
+
+    /*!
+     * \brief Framework-specific function pointers table for DLPack exchange.
+     *
+     * Additionally to `__dlpack__()` we define a C function table sharable by
+     *
+     * Python implementations via `__dlpack_c_exchange_api__`.
+     * This attribute must be set on the type as a Python PyCapsule
+     * with name "dlpack_exchange_api".
+     *
+     * A consumer library may use a pattern such as:
+     *
+     * \code
+     *
+     *  PyObject *api_capsule = PyObject_GetAttrString(
+     *    (PyObject *)Py_TYPE(tensor_obj), "__dlpack_c_exchange_api__")
+     *  );
+     *  if (api_capsule == NULL) { goto handle_error; }
+     *  MyDLPackExchangeAPI *api = (MyDLPackExchangeAPI *)PyCapsule_GetPointer(
+     *    api_capsule, "dlpack_exchange_api"
+     *  );
+     *  Py_DECREF(api_capsule);
+     *  if (api == NULL) { goto handle_error; }
+     *
+     * \endcode
+     *
+     * Note that this must be defined on the type. The consumer should look up
+     * the attribute on the type and may cache the result for each unique type.
+     *
+     * The precise API table is given by:
+     * \code
+     * struct MyDLPackExchangeAPI : public DLPackExchangeAPI {
+     *   MyDLPackExchangeAPI() {
+     *     header.version.major = DLPACK_MAJOR_VERSION;
+     *     header.version.minor = DLPACK_MINOR_VERSION;
+     *     header.prev_version_api = nullptr;
+     *
+     *     managed_tensor_allocator = MyDLPackManagedTensorAllocator;
+     *     managed_tensor_from_py_object_no_sync =
+     * MyDLPackManagedTensorFromPyObjectNoSync;
+     *     managed_tensor_to_py_object_no_sync =
+     * MyDLPackManagedTensorToPyObjectNoSync; dltensor_from_py_object_no_sync =
+     * MyDLPackDLTensorFromPyObjectNoSync; current_work_stream =
+     * MyDLPackCurrentWorkStream;
+     *  }
+     *
+     *  static const DLPackExchangeAPI* Global() {
+     *     static MyDLPackExchangeAPI inst;
+     *     return &inst;
+     *  }
+     * };
+     * \endcode
+     *
+     * Guidelines for leveraging DLPackExchangeAPI:
+     *
+     * There are generally two kinds of consumer needs for DLPack exchange:
+     * - N0: library support, where consumer.kernel(x, y, z) would like to run a
+     * kernel with the data from x, y, z. The consumer is also expected to run
+     * the kernel with the same stream context as the producer. For example,
+     * when x, y, z is torch.Tensor, consumer should query
+     * exchange_api->current_work_stream to get the current stream and launch
+     * the kernel with the same stream. This setup is necessary for no
+     * synchronization in kernel launch and maximum compatibility with CUDA
+     * graph capture in the producer. This is the desirable behavior for library
+     * extension support for frameworks like PyTorch.
+     * - N1: data ingestion and retention
+     *
+     * Note that obj.__dlpack__() API should provide useful ways for N1.
+     * The primary focus of the current DLPackExchangeAPI is to enable faster
+     * exchange N0 with the support of the function pointer current_work_stream.
+     *
+     * Array/Tensor libraries should statically create and initialize this
+     * structure then return a pointer to DLPackExchangeAPI as an int value in
+     * Tensor/Array. The DLPackExchangeAPI* must stay alive throughout the
+     * lifetime of the process.
+     *
+     * One simple way to do so is to create a static instance of
+     * DLPackExchangeAPI within the framework and return a pointer to it. The
+     * following code shows an example to do so in C++. It should also be
+     * reasonably easy to do so in other languages.
+     */
+    typedef struct DLPackExchangeAPI
+    {
+        /*!
+         * \brief The header that remains stable across versions.
+         */
+        DLPackExchangeAPIHeader header;
+        /*!
+         * \brief Producer function pointer for DLPackManagedTensorAllocator
+         *        This function must not be NULL.
+         * \sa DLPackManagedTensorAllocator
+         */
+        DLPackManagedTensorAllocator managed_tensor_allocator;
+        /*!
+         * \brief Producer function pointer for DLPackManagedTensorFromPyObject
+         *        This function must be not NULL.
+         * \sa DLPackManagedTensorFromPyObject
+         */
+        DLPackManagedTensorFromPyObjectNoSync
+            managed_tensor_from_py_object_no_sync;
+        /*!
+         * \brief Producer function pointer for DLPackManagedTensorToPyObject
+         *        This function must be not NULL.
+         * \sa DLPackManagedTensorToPyObjectNoSync
+         */
+        DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
+        /*!
+         * \brief Producer function pointer for DLPackDLTensorFromPyObject
+         *        This function can be NULL when the producer does not support
+         * this function. \sa DLPackDLTensorFromPyObjectNoSync
+         */
+        DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
+        /*!
+         * \brief Producer function pointer for DLPackCurrentWorkStream
+         *        This function must be not NULL.
+         * \sa DLPackCurrentWorkStream
+         */
+        DLPackCurrentWorkStream current_work_stream;
+    } DLPackExchangeAPI;
+
+#ifdef __cplusplus
+} // DLPACK_EXTERN_C
+#endif
+#endif // DLPACK_DLPACK_H_
diff --git a/dpnp/__init__.py b/dpnp/__init__.py
index 02420107972f..dd413d02f2bb 100644
--- a/dpnp/__init__.py
+++ b/dpnp/__init__.py
@@ -28,7 +28,6 @@
 
 import os
 import sys
-import warnings
 
 mypath = os.path.dirname(os.path.realpath(__file__))
 
@@ -61,10 +60,9 @@
                 [os.getenv("PATH", ""), dll_path]
             )
 
-# Borrowed from DPCTL
-with warnings.catch_warnings():
-    warnings.simplefilter("ignore", DeprecationWarning)
-    from dpctl.tensor import __array_api_version__, DLDeviceType
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor import __array_api_version__, DLDeviceType
 
 from .dpnp_array import dpnp_array as ndarray
 from .dpnp_array_api_info import __array_namespace_info__
diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt
index 69a99b996d97..2dce27001bbd 100644
--- a/dpnp/backend/extensions/blas/CMakeLists.txt
+++ b/dpnp/backend/extensions/blas/CMakeLists.txt
@@ -39,6 +39,8 @@ set(_module_src
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
 if(_dpnp_sycl_targets)
     # make fat binary
     target_compile_options(
diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt
index 8a96d8cbd25a..bfebe1ed4226 100644
--- a/dpnp/backend/extensions/fft/CMakeLists.txt
+++ b/dpnp/backend/extensions/fft/CMakeLists.txt
@@ -33,6 +33,8 @@ set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/fft_py.cpp)
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
 if(_dpnp_sycl_targets)
     # make fat binary
     target_compile_options(
diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt
index 373c6152f662..7729e2807a4d 100644
--- a/dpnp/backend/extensions/indexing/CMakeLists.txt
+++ b/dpnp/backend/extensions/indexing/CMakeLists.txt
@@ -36,6 +36,8 @@ set(_module_src
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
 if(_dpnp_sycl_targets)
     # make fat binary
     target_compile_options(
diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt
index 2bac0932a673..a3ee4bae8ee5 100644
--- a/dpnp/backend/extensions/lapack/CMakeLists.txt
+++ b/dpnp/backend/extensions/lapack/CMakeLists.txt
@@ -55,6 +55,7 @@ set(_module_src
 
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
 
 if(_dpnp_sycl_targets)
     # make fat binary
diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index 60d26295acf8..88b3f185e6f6 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -41,6 +41,8 @@ set(_module_src
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
 if(_dpnp_sycl_targets)
     # make fat binary
     target_compile_options(
diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt
index 45d2706fb48d..d954316dcb2a 100644
--- a/dpnp/backend/extensions/ufunc/CMakeLists.txt
+++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt
@@ -67,6 +67,8 @@ set(_module_src
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
 if(WIN32)
     if(${CMAKE_VERSION} VERSION_LESS "3.27")
         # this is a work-around for target_link_options inserting option after -link option, cause
diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt
index 32f7d4281c2f..0d69c4e79c03 100644
--- a/dpnp/backend/extensions/vm/CMakeLists.txt
+++ b/dpnp/backend/extensions/vm/CMakeLists.txt
@@ -90,6 +90,8 @@ set(python_module_name _vm_impl)
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
 if(WIN32)
     if(${CMAKE_VERSION} VERSION_LESS "3.27")
         # this is a work-around for target_link_options inserting option after -link option, cause
diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt
index 5b7921ad324c..c8cbd7c03bbc 100644
--- a/dpnp/backend/extensions/window/CMakeLists.txt
+++ b/dpnp/backend/extensions/window/CMakeLists.txt
@@ -36,6 +36,8 @@ set(_module_src
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
 if(_dpnp_sycl_targets)
     # make fat binary
     target_compile_options(
diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp
index 1a4fb69782dd..43a3df995cc6 100644
--- a/dpnp/backend/include/dpnp4pybind11.hpp
+++ b/dpnp/backend/include/dpnp4pybind11.hpp
@@ -28,7 +28,17 @@
 
 #pragma once
 
-#include "dpctl_capi.h"
+// Include dpctl_ext C-API (provides unified access to both dpctl and dpctl_ext)
+// This includes:
+// - dpctl C-API (from external dpctl package - SYCL interface)
+// - dpctl_ext C-API (tensor interface: usm_ndarray)
+//
+// TODO: When dpctl_ext is renamed to dpctl.tensor:
+//   - Update include: "dpctl_ext_capi.h" → "dpctl/tensor/tensor_capi.h"
+//     (Use tensor_capi.h, NOT dpctl_capi.h, to avoid conflict with external
+//     dpctl)
+//   - Update import calls: import_dpctl_ext() → import_dpctl_tensor()
+#include "dpctl_ext_capi.h"
 
 #include <array>
 #include <cassert>
@@ -284,7 +294,8 @@ class dpctl_capi
         // e.g. SyclDevice_GetDeviceRef, etc.
         // pointers to Python types, i.e. PySyclDeviceType, etc.
         // and exported constants, i.e. USM_ARRAY_C_CONTIGUOUS, etc.
-        import_dpctl();
+        // TODO: rename once dpctl_ext is renamed
+        import_dpctl_ext(); // Imports both dpctl and dpctl_ext C-APIs
 
         // Python type objects for classes implemented by dpctl
         this->Py_SyclDeviceType_ = &Py_SyclDeviceType;
@@ -414,8 +425,10 @@ class dpctl_capi
         default_usm_memory_ = std::shared_ptr<py::object>(
             new py::object{py_default_usm_memory}, Deleter{});
 
+        // TODO: revert to `py::module_::import("dpctl.tensor._usmarray");`
+        // when dpnp fully migrates dpctl/tensor
         py::module_ mod_usmarray =
-            py::module_::import("dpctl.tensor._usmarray");
+            py::module_::import("dpctl_ext.tensor._usmarray");
         auto tensor_kl = mod_usmarray.attr("usm_ndarray");
 
         const py::object &py_default_usm_ndarray =
diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py
index 4e2ee8531a18..fb277dd4d310 100644
--- a/dpnp/dpnp_algo/dpnp_arraycreation.py
+++ b/dpnp/dpnp_algo/dpnp_arraycreation.py
@@ -29,13 +29,12 @@
 import math
 import operator
 
-import dpctl.tensor as dpt
 import dpctl.utils as dpu
 import numpy
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpnp
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device
@@ -53,7 +52,7 @@ def _as_usm_ndarray(a, usm_type, sycl_queue):
 
     if isinstance(a, dpnp_array):
         a = a.get_array()
-    return dpt_ext.asarray(a, usm_type=usm_type, sycl_queue=sycl_queue)
+    return dpt.asarray(a, usm_type=usm_type, sycl_queue=sycl_queue)
 
 
 def _check_has_zero_val(a):
@@ -196,7 +195,7 @@ def dpnp_linspace(
 
     if dpnp.isscalar(start) and dpnp.isscalar(stop):
         # Call linspace() function for scalars.
-        usm_res = dpt_ext.linspace(
+        usm_res = dpt.linspace(
             start,
             stop,
             num,
@@ -213,19 +212,19 @@ def dpnp_linspace(
             else:
                 step = dpnp.nan
     else:
-        usm_start = dpt_ext.asarray(
+        usm_start = dpt.asarray(
             start,
             dtype=dt,
             usm_type=_usm_type,
             sycl_queue=sycl_queue_normalized,
         )
-        usm_stop = dpt_ext.asarray(
+        usm_stop = dpt.asarray(
             stop, dtype=dt, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
         )
 
         delta = usm_stop - usm_start
 
-        usm_res = dpt_ext.arange(
+        usm_res = dpt.arange(
             0,
             stop=num,
             step=1,
@@ -233,9 +232,7 @@ def dpnp_linspace(
             usm_type=_usm_type,
             sycl_queue=sycl_queue_normalized,
         )
-        usm_res = dpt_ext.reshape(
-            usm_res, (-1,) + (1,) * delta.ndim, copy=False
-        )
+        usm_res = dpt.reshape(usm_res, (-1,) + (1,) * delta.ndim, copy=False)
 
         if step_num > 0:
             step = delta / step_num
@@ -243,7 +240,7 @@ def dpnp_linspace(
             # Needed a special handling for denormal numbers (when step == 0),
             # see numpy#5437 for more details.
             # Note, dpt.where() is used to avoid a synchronization branch.
-            usm_res = dpt_ext.where(
+            usm_res = dpt.where(
                 step == 0, (usm_res / step_num) * delta, usm_res * step
             )
         else:
@@ -256,17 +253,17 @@ def dpnp_linspace(
             usm_res[-1, ...] = usm_stop
 
     if axis != 0:
-        usm_res = dpt_ext.moveaxis(usm_res, 0, axis)
+        usm_res = dpt.moveaxis(usm_res, 0, axis)
 
     if dpnp.issubdtype(dtype, dpnp.integer):
         dpt.floor(usm_res, out=usm_res)
 
-    res = dpt_ext.astype(usm_res, dtype, copy=False)
+    res = dpt.astype(usm_res, dtype, copy=False)
     res = dpnp_array._create_from_usm_ndarray(res)
 
     if retstep is True:
         if dpnp.isscalar(step):
-            step = dpt_ext.asarray(
+            step = dpt.asarray(
                 step, usm_type=res.usm_type, sycl_queue=res.sycl_queue
             )
         return res, dpnp_array._create_from_usm_ndarray(step)
diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
index 6aaf46f7ad9c..271013b58090 100644
--- a/dpnp/dpnp_algo/dpnp_elementwise_common.py
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -29,14 +29,13 @@
 import warnings
 from functools import wraps
 
-import dpctl.tensor as dpt
 import dpctl.utils as dpu
 import numpy
 
 # pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._copy_utils as dtc
 import dpctl_ext.tensor._tensor_impl as dti
 import dpctl_ext.tensor._type_utils as dtu
@@ -213,7 +212,7 @@ def __call__(
 
         x_usm = dpnp.get_usm_ndarray(x)
         if dtype is not None:
-            x_usm = dpt_ext.astype(x_usm, dtype, copy=False)
+            x_usm = dpt.astype(x_usm, dtype, copy=False)
 
         out = self._unpack_out_kw(out)
         out_usm = None if out is None else dpnp.get_usm_ndarray(out)
@@ -467,7 +466,7 @@ def __call__(
                     )
 
                 # Allocate a temporary buffer with the required dtype
-                out[i] = dpt_ext.empty_like(res, dtype=res_dt)
+                out[i] = dpt.empty_like(res, dtype=res_dt)
             elif (
                 buf_dt is None
                 and dti._array_overlap(x, res)
@@ -476,7 +475,7 @@ def __call__(
                 # Allocate a temporary buffer to avoid memory overlapping.
                 # Note if `buf_dt` is not None, a temporary copy of `x` will be
                 # created, so the array overlap check isn't needed.
-                out[i] = dpt_ext.empty_like(res)
+                out[i] = dpt.empty_like(res)
 
         _manager = dpu.SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
@@ -486,7 +485,7 @@ def __call__(
             if order == "K":
                 buf = dtc._empty_like_orderK(x, buf_dt)
             else:
-                buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order)
+                buf = dpt.empty_like(x, dtype=buf_dt, order=order)
 
             ht_copy_ev, copy_ev = dti._copy_usm_ndarray_into_usm_ndarray(
                 src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs
@@ -503,7 +502,7 @@ def __call__(
                 if order == "K":
                     out[i] = dtc._empty_like_orderK(x, res_dt)
                 else:
-                    out[i] = dpt_ext.empty_like(x, dtype=res_dt, order=order)
+                    out[i] = dpt.empty_like(x, dtype=res_dt, order=order)
 
         # Call the unary function with input and output arrays
         ht_unary_ev, unary_ev = self.get_implementation_function()(
@@ -713,24 +712,24 @@ def __call__(
 
         if dtype is not None:
             if dpnp.isscalar(x1):
-                x1_usm = dpt_ext.asarray(
+                x1_usm = dpt.asarray(
                     x1,
                     dtype=dtype,
                     sycl_queue=x2.sycl_queue,
                     usm_type=x2.usm_type,
                 )
-                x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False)
+                x2_usm = dpt.astype(x2_usm, dtype, copy=False)
             elif dpnp.isscalar(x2):
-                x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False)
-                x2_usm = dpt_ext.asarray(
+                x1_usm = dpt.astype(x1_usm, dtype, copy=False)
+                x2_usm = dpt.asarray(
                     x2,
                     dtype=dtype,
                     sycl_queue=x1.sycl_queue,
                     usm_type=x1.usm_type,
                 )
             else:
-                x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False)
-                x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False)
+                x1_usm = dpt.astype(x1_usm, dtype, copy=False)
+                x2_usm = dpt.astype(x2_usm, dtype, copy=False)
 
         res_usm = super().__call__(x1_usm, x2_usm, out=out_usm, order=order)
 
@@ -1078,7 +1077,7 @@ def __call__(
                     )
 
                 # Allocate a temporary buffer with the required dtype
-                out[i] = dpt_ext.empty_like(res, dtype=res_dt)
+                out[i] = dpt.empty_like(res, dtype=res_dt)
             else:
                 # If `dt` is not None, a temporary copy of `x` will be created,
                 # so the array overlap check isn't needed.
@@ -1094,7 +1093,7 @@ def __call__(
                     for x in x_to_check
                 ):
                     # allocate a temporary buffer to avoid memory overlapping
-                    out[i] = dpt_ext.empty_like(res)
+                    out[i] = dpt.empty_like(res)
 
         x1 = dpnp.as_usm_ndarray(x1, dtype=x1_dt, sycl_queue=exec_q)
         x2 = dpnp.as_usm_ndarray(x2, dtype=x2_dt, sycl_queue=exec_q)
@@ -1127,7 +1126,7 @@ def __call__(
                 if order == "K":
                     buf = dtc._empty_like_orderK(x, buf_dt)
                 else:
-                    buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order)
+                    buf = dpt.empty_like(x, dtype=buf_dt, order=order)
 
                 ht_copy_ev, copy_ev = dti._copy_usm_ndarray_into_usm_ndarray(
                     src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs
@@ -1146,7 +1145,7 @@ def __call__(
                         x1, x2, res_dt, res_shape, res_usm_type, exec_q
                     )
                 else:
-                    out[i] = dpt_ext.empty(
+                    out[i] = dpt.empty(
                         res_shape,
                         dtype=res_dt,
                         order=order,
@@ -1156,9 +1155,9 @@ def __call__(
 
         # Broadcast shapes of input arrays
         if x1.shape != res_shape:
-            x1 = dpt_ext.broadcast_to(x1, res_shape)
+            x1 = dpt.broadcast_to(x1, res_shape)
         if x2.shape != res_shape:
-            x2 = dpt_ext.broadcast_to(x2, res_shape)
+            x2 = dpt.broadcast_to(x2, res_shape)
 
         # Call the binary function with input and output arrays
         ht_binary_ev, binary_ev = self.get_implementation_function()(
@@ -1326,7 +1325,7 @@ def __call__(self, x, /, decimals=0, out=None, *, dtype=None):
                 res_usm = dpt.divide(x_usm, 10**decimals, out=out_usm)
 
             if dtype is not None:
-                res_usm = dpt_ext.astype(res_usm, dtype, copy=False)
+                res_usm = dpt.astype(res_usm, dtype, copy=False)
 
             if out is not None and isinstance(out, dpnp_array):
                 return out
diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py
index 6418302d6e7b..cbb5835bbfc4 100644
--- a/dpnp/dpnp_array.py
+++ b/dpnp/dpnp_array.py
@@ -37,11 +37,9 @@
 
 import warnings
 
-import dpctl.tensor as dpt
-
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 from dpctl_ext.tensor._numpy_helper import AxisError
@@ -777,7 +775,7 @@ def asnumpy(self):
 
         """
 
-        return dpt_ext.asnumpy(self._array_obj)
+        return dpt.asnumpy(self._array_obj)
 
     def astype(
         self,
@@ -2283,7 +2281,7 @@ def transpose(self, *axes):
                 # self.transpose(None).shape == self.shape[::-1]
                 axes = tuple((ndim - x - 1) for x in range(ndim))
 
-            usm_res = dpt_ext.permute_dims(self._array_obj, axes)
+            usm_res = dpt.permute_dims(self._array_obj, axes)
         return dpnp_array._create_from_usm_ndarray(usm_res)
 
     def var(
diff --git a/dpnp/dpnp_array_api_info.py b/dpnp/dpnp_array_api_info.py
index 6a3939d046b0..f792600cbb66 100644
--- a/dpnp/dpnp_array_api_info.py
+++ b/dpnp/dpnp_array_api_info.py
@@ -36,7 +36,9 @@
 
 """
 
-import dpctl.tensor as dpt
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
 
 
 def __array_namespace_info__():
diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index 9fca083a6413..13b957ffff8f 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -45,17 +45,16 @@
 import os
 
 import dpctl
-import dpctl.tensor as dpt
 import dpctl.utils as dpu
 import numpy
-from dpctl.tensor._device import normalize_queue_device
 
 # pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
+from dpctl_ext.tensor._device import normalize_queue_device
 
 from .dpnp_array import dpnp_array
 from .dpnp_utils import (
@@ -137,7 +136,7 @@ def asnumpy(a, order="C"):
         return a.asnumpy()
 
     if isinstance(a, dpt.usm_ndarray):
-        return dpt_ext.asnumpy(a)
+        return dpt.asnumpy(a)
 
     return numpy.asarray(a, order=order)
 
@@ -191,7 +190,7 @@ def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None):
     if is_supported_array_type(a):
         return get_usm_ndarray(a)
 
-    return dpt_ext.asarray(
+    return dpt.asarray(
         a, dtype=dtype, device=device, usm_type=usm_type, sycl_queue=sycl_queue
     )
 
diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py
index d09cc17bde79..2800df0b2ac8 100644
--- a/dpnp/dpnp_iface_arraycreation.py
+++ b/dpnp/dpnp_iface_arraycreation.py
@@ -43,12 +43,11 @@
 
 import operator
 
-import dpctl.tensor as dpt
 import numpy
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpnp
 from dpnp import dpnp_container
 
@@ -937,7 +936,7 @@ def astype(x, dtype, /, *, order="K", casting="unsafe", copy=True, device=None):
         order = "K"
 
     usm_x = dpnp.get_usm_ndarray(x)
-    usm_res = dpt_ext.astype(
+    usm_res = dpt.astype(
         usm_x, dtype, order=order, casting=casting, copy=copy, device=device
     )
 
@@ -3119,7 +3118,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"):
 
     s0 = (1,) * ndim
     output = [
-        dpt_ext.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :])
+        dpt.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :])
         for i, x in enumerate(xi)
     ]
 
@@ -3127,14 +3126,14 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"):
     _, _ = get_usm_allocations(output)
 
     if indexing == "xy" and ndim > 1:
-        output[0] = dpt_ext.reshape(output[0], (1, -1) + s0[2:])
-        output[1] = dpt_ext.reshape(output[1], (-1, 1) + s0[2:])
+        output[0] = dpt.reshape(output[0], (1, -1) + s0[2:])
+        output[1] = dpt.reshape(output[1], (-1, 1) + s0[2:])
 
     if not sparse:
-        output = dpt_ext.broadcast_arrays(*output)
+        output = dpt.broadcast_arrays(*output)
 
     if copy:
-        output = [dpt_ext.copy(x) for x in output]
+        output = [dpt.copy(x) for x in output]
 
     return [dpnp_array._create_from_usm_ndarray(x) for x in output]
 
@@ -3696,7 +3695,7 @@ def tri(
     if usm_type is None:
         usm_type = "device"
 
-    m = dpt_ext.ones(
+    m = dpt.ones(
         (N, M),
         dtype=_dtype,
         device=device,
@@ -3912,7 +3911,7 @@ def vander(
 
     if dpnp.is_supported_array_type(x):
         x = dpnp.get_usm_ndarray(x)
-    usm_x = dpt_ext.asarray(
+    usm_x = dpt.asarray(
         x, device=device, usm_type=usm_type, sycl_queue=sycl_queue
     )
 
@@ -3934,8 +3933,8 @@ def vander(
 
     tmp = m[:, ::-1] if not increasing else m
     dpnp.power(
-        dpt_ext.reshape(usm_x, (-1, 1)),
-        dpt_ext.arange(
+        dpt.reshape(usm_x, (-1, 1)),
+        dpt.arange(
             N, dtype=_dtype, usm_type=x_usm_type, sycl_queue=x_sycl_queue
         ),
         out=tmp,
diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index a52196e9e4db..4b8fb7bb6a38 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -44,14 +44,13 @@
 import operator
 from collections.abc import Iterable
 
-import dpctl.tensor as dpt
 import dpctl.utils as dpu
 import numpy
 
 # pylint: disable=no-name-in-module
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 
@@ -141,9 +140,9 @@ def _choose_run(inds, chcs, q, usm_type, out=None, mode=0):
             ti._array_overlap(out, chc) for chc in chcs
         ):
             # Allocate a temporary buffer to avoid memory overlapping.
-            out = dpt_ext.empty_like(out)
+            out = dpt.empty_like(out)
     else:
-        out = dpt_ext.empty(
+        out = dpt.empty(
             inds.shape, dtype=chcs[0].dtype, usm_type=usm_type, sycl_queue=q
         )
 
@@ -242,7 +241,7 @@ def choose(a, choices, out=None, mode="wrap"):
         # NumPy will cast up to int64 in general but
         # int32 is more than safe for bool
         if ind_dt == dpnp.bool:
-            inds = dpt_ext.astype(inds, dpt.int32)
+            inds = dpt.astype(inds, dpt.int32)
         else:
             raise TypeError("input index array must be of integer data type")
 
@@ -250,17 +249,17 @@ def choose(a, choices, out=None, mode="wrap"):
 
     res_usm_type, exec_q = get_usm_allocations(choices + [inds])
     # apply type promotion to input choices
-    res_dt = dpt_ext.result_type(*choices)
+    res_dt = dpt.result_type(*choices)
     if len(choices) > 1:
         choices = tuple(
             map(
                 lambda chc: (
-                    chc if chc.dtype == res_dt else dpt_ext.astype(chc, res_dt)
+                    chc if chc.dtype == res_dt else dpt.astype(chc, res_dt)
                 ),
                 choices,
             )
         )
-    arrs_broadcast = dpt_ext.broadcast_arrays(inds, *choices)
+    arrs_broadcast = dpt.broadcast_arrays(inds, *choices)
     inds = arrs_broadcast[0]
     choices = tuple(arrs_broadcast[1:])
 
@@ -301,11 +300,9 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0):
 
         if ti._array_overlap(x, out):
             # Allocate a temporary buffer to avoid memory overlapping.
-            out = dpt_ext.empty_like(out)
+            out = dpt.empty_like(out)
     else:
-        out = dpt_ext.empty(
-            res_sh, dtype=x.dtype, usm_type=usm_type, sycl_queue=q
-        )
+        out = dpt.empty(res_sh, dtype=x.dtype, usm_type=usm_type, sycl_queue=q)
 
     _manager = dpu.SequentialOrderManager[q]
     dep_evs = _manager.submitted_events
@@ -816,16 +813,16 @@ def extract(condition, a):
     )
 
     if usm_cond.size != usm_a.size:
-        usm_a = dpt_ext.reshape(usm_a, -1)
-        usm_cond = dpt_ext.reshape(usm_cond, -1)
+        usm_a = dpt.reshape(usm_a, -1)
+        usm_cond = dpt.reshape(usm_cond, -1)
 
-        usm_res = dpt_ext.take(usm_a, dpt_ext.nonzero(usm_cond)[0])
+        usm_res = dpt.take(usm_a, dpt.nonzero(usm_cond)[0])
     else:
         if usm_cond.shape != usm_a.shape:
-            usm_a = dpt_ext.reshape(usm_a, -1)
-            usm_cond = dpt_ext.reshape(usm_cond, -1)
+            usm_a = dpt.reshape(usm_a, -1)
+            usm_cond = dpt.reshape(usm_cond, -1)
 
-        usm_res = dpt_ext.extract(usm_cond, usm_a)
+        usm_res = dpt.extract(usm_cond, usm_a)
 
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
@@ -960,18 +957,18 @@ def fill_diagonal(a, val, wrap=False):
     # a.flat[:end:step] = val
     # but need to consider use case when `a` is usm_ndarray also
     a_sh = a.shape
-    tmp_a = dpt_ext.reshape(usm_a, -1)
+    tmp_a = dpt.reshape(usm_a, -1)
     if dpnp.isscalar(usm_val):
         tmp_a[:end:step] = usm_val
     else:
-        usm_val = dpt_ext.reshape(usm_val, -1)
+        usm_val = dpt.reshape(usm_val, -1)
 
         # Setitem can work only if index size equal val size.
         # Using loop for general case without dependencies of val size.
         for i in range(0, usm_val.size):
             tmp_a[step * i : end : step * (i + 1)] = usm_val[i]
 
-    tmp_a = dpt_ext.reshape(tmp_a, a_sh)
+    tmp_a = dpt.reshape(tmp_a, a_sh)
     usm_a[:] = tmp_a
 
 
@@ -1548,7 +1545,7 @@ def nonzero(a):
 
     usm_a = dpnp.get_usm_ndarray(a)
     return tuple(
-        dpnp_array._create_from_usm_ndarray(y) for y in dpt_ext.nonzero(usm_a)
+        dpnp_array._create_from_usm_ndarray(y) for y in dpt.nonzero(usm_a)
     )
 
 
@@ -1612,16 +1609,14 @@ def place(a, mask, vals):
 
     if usm_vals.ndim != 1:
         # dpt.place supports only 1-D array of values
-        usm_vals = dpt_ext.reshape(usm_vals, -1)
+        usm_vals = dpt.reshape(usm_vals, -1)
 
     if usm_vals.dtype != usm_a.dtype:
         # dpt.place casts values to a.dtype with "unsafe" rule,
         # while numpy.place does that with "safe" casting rule
-        usm_vals = dpt_ext.astype(
-            usm_vals, usm_a.dtype, casting="safe", copy=False
-        )
+        usm_vals = dpt.astype(usm_vals, usm_a.dtype, casting="safe", copy=False)
 
-    dpt_ext.place(usm_a, usm_mask, usm_vals)
+    dpt.place(usm_a, usm_mask, usm_vals)
 
 
 def put(a, ind, v, /, *, axis=None, mode="wrap"):
@@ -1711,19 +1706,19 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"):
 
     if usm_ind.ndim != 1:
         # dpt.put supports only 1-D array of indices
-        usm_ind = dpt_ext.reshape(usm_ind, -1, copy=False)
+        usm_ind = dpt.reshape(usm_ind, -1, copy=False)
 
     if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer):
         # dpt.put supports only integer dtype for array of indices
-        usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, casting="safe")
+        usm_ind = dpt.astype(usm_ind, dpnp.intp, casting="safe")
 
     in_usm_a = usm_a
     if axis is None and usm_a.ndim > 1:
-        usm_a = dpt_ext.reshape(usm_a, -1)
+        usm_a = dpt.reshape(usm_a, -1)
 
-    dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode)
+    dpt.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode)
     if in_usm_a._pointer != usm_a._pointer:  # pylint: disable=protected-access
-        in_usm_a[:] = dpt_ext.reshape(usm_a, in_usm_a.shape, copy=False)
+        in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False)
 
 
 def put_along_axis(a, ind, values, axis, mode="wrap"):
@@ -1805,11 +1800,11 @@ def put_along_axis(a, ind, values, axis, mode="wrap"):
     if dpnp.is_supported_array_type(values):
         usm_vals = dpnp.get_usm_ndarray(values)
     else:
-        usm_vals = dpt_ext.asarray(
+        usm_vals = dpt.asarray(
             values, usm_type=a.usm_type, sycl_queue=a.sycl_queue
         )
 
-    dpt_ext.put_along_axis(usm_a, usm_ind, usm_vals, axis=axis, mode=mode)
+    dpt.put_along_axis(usm_a, usm_ind, usm_vals, axis=axis, mode=mode)
 
 
 def putmask(x1, mask, values):
@@ -2153,7 +2148,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"):
 
     usm_a = dpnp.get_usm_ndarray(a)
     if not dpnp.is_supported_array_type(indices):
-        usm_ind = dpt_ext.asarray(
+        usm_ind = dpt.asarray(
             indices, usm_type=a.usm_type, sycl_queue=a.sycl_queue
         )
     else:
@@ -2165,7 +2160,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"):
     if axis is None:
         if a_ndim > 1:
             # flatten input array
-            usm_a = dpt_ext.reshape(usm_a, -1)
+            usm_a = dpt.reshape(usm_a, -1)
         axis = 0
     elif a_ndim == 0:
         axis = normalize_axis_index(operator.index(axis), 1)
@@ -2174,7 +2169,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"):
 
     if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer):
         # dpt.take supports only integer dtype for array of indices
-        usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, copy=False, casting="safe")
+        usm_ind = dpt.astype(usm_ind, dpnp.intp, copy=False, casting="safe")
 
     usm_res = _take_index(
         usm_a, usm_ind, axis, exec_q, res_usm_type, out=out, mode=mode
@@ -2297,7 +2292,7 @@ def take_along_axis(a, indices, axis=-1, mode="wrap"):
     usm_a = dpnp.get_usm_ndarray(a)
     usm_ind = dpnp.get_usm_ndarray(indices)
 
-    usm_res = dpt_ext.take_along_axis(usm_a, usm_ind, axis=axis, mode=mode)
+    usm_res = dpt.take_along_axis(usm_a, usm_ind, axis=axis, mode=mode)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index 2ff08cc6ec8b..0fc2c3f80fde 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -45,12 +45,11 @@
 from typing import NamedTuple
 
 import dpctl
-import dpctl.tensor as dpt
 import numpy
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpnp
 from dpctl_ext.tensor._numpy_helper import (
     AxisError,
@@ -375,27 +374,25 @@ def _get_first_nan_index(usm_a):
         ):
             if dpnp.issubdtype(usm_a.dtype, dpnp.complexfloating):
                 # for complex all NaNs are considered equivalent
-                true_val = dpt_ext.asarray(
+                true_val = dpt.asarray(
                     True, sycl_queue=usm_a.sycl_queue, usm_type=usm_a.usm_type
                 )
-                return dpt_ext.searchsorted(
-                    dpt.isnan(usm_a), true_val, side="left"
-                )
-            return dpt_ext.searchsorted(usm_a, usm_a[-1], side="left")
+                return dpt.searchsorted(dpt.isnan(usm_a), true_val, side="left")
+            return dpt.searchsorted(usm_a, usm_a[-1], side="left")
         return None
 
     usm_ar = dpnp.get_usm_ndarray(ar)
 
     num_of_flags = (return_index, return_inverse, return_counts).count(True)
     if num_of_flags == 0:
-        usm_res = dpt_ext.unique_values(usm_ar)
+        usm_res = dpt.unique_values(usm_ar)
         usm_res = (usm_res,)  # cast to a tuple to align with other cases
     elif num_of_flags == 1 and return_inverse:
-        usm_res = dpt_ext.unique_inverse(usm_ar)
+        usm_res = dpt.unique_inverse(usm_ar)
     elif num_of_flags == 1 and return_counts:
-        usm_res = dpt_ext.unique_counts(usm_ar)
+        usm_res = dpt.unique_counts(usm_ar)
     else:
-        usm_res = dpt_ext.unique_all(usm_ar)
+        usm_res = dpt.unique_all(usm_ar)
 
     first_nan = None
     if equal_nan:
@@ -417,10 +414,10 @@ def _get_first_nan_index(usm_a):
         if first_nan is not None:
             # all NaNs are collapsed, so need to replace the indices with
             # the index of the first NaN value in result array of unique values
-            dpt_ext.place(
+            dpt.place(
                 usm_res.inverse_indices,
                 usm_res.inverse_indices > first_nan,
-                dpt_ext.reshape(first_nan, 1),
+                dpt.reshape(first_nan, 1),
             )
 
         result += (usm_res.inverse_indices,)
@@ -428,9 +425,7 @@ def _get_first_nan_index(usm_a):
         if first_nan is not None:
             # all NaNs are collapsed, so need to put a count of all NaNs
             # at the last index
-            dpt_ext.sum(
-                usm_res.counts[first_nan:], out=usm_res.counts[first_nan]
-            )
+            dpt.sum(usm_res.counts[first_nan:], out=usm_res.counts[first_nan])
             result += (usm_res.counts[: first_nan + 1],)
         else:
             result += (usm_res.counts,)
@@ -1097,9 +1092,7 @@ def broadcast_arrays(*args, subok=False):
     if len(args) == 0:
         return []
 
-    usm_arrays = dpt_ext.broadcast_arrays(
-        *[dpnp.get_usm_ndarray(a) for a in args]
-    )
+    usm_arrays = dpt.broadcast_arrays(*[dpnp.get_usm_ndarray(a) for a in args])
     return [dpnp_array._create_from_usm_ndarray(a) for a in usm_arrays]
 
 
@@ -1184,7 +1177,7 @@ def broadcast_to(array, /, shape, subok=False):
         raise NotImplementedError(f"subok={subok} is currently not supported")
 
     usm_array = dpnp.get_usm_ndarray(array)
-    new_array = dpt_ext.broadcast_to(usm_array, shape)
+    new_array = dpt.broadcast_to(usm_array, shape)
     return dpnp_array._create_from_usm_ndarray(new_array)
 
 
@@ -1276,7 +1269,7 @@ def can_cast(from_, to, casting="safe"):
         if dpnp.is_supported_array_type(from_)
         else dpnp.dtype(from_)
     )
-    return dpt_ext.can_cast(dtype_from, to, casting=casting)
+    return dpt.can_cast(dtype_from, to, casting=casting)
 
 
 def column_stack(tup):
@@ -1422,7 +1415,7 @@ def concatenate(
         )
 
     usm_arrays = [dpnp.get_usm_ndarray(x) for x in arrays]
-    usm_res = dpt_ext.concat(usm_arrays, axis=axis)
+    usm_res = dpt.concat(usm_arrays, axis=axis)
 
     res = dpnp_array._create_from_usm_ndarray(usm_res)
     if dtype is not None:
@@ -1527,7 +1520,7 @@ def copyto(dst, src, casting="same_kind", where=True):
                 f"but got {where.dtype}"
             )
 
-        dst_usm, src_usm, mask_usm = dpt_ext.broadcast_arrays(
+        dst_usm, src_usm, mask_usm = dpt.broadcast_arrays(
             dpnp.get_usm_ndarray(dst),
             dpnp.get_usm_ndarray(src),
             dpnp.get_usm_ndarray(where),
@@ -1855,7 +1848,7 @@ def expand_dims(a, axis):
     """
 
     usm_a = dpnp.get_usm_ndarray(a)
-    usm_res = dpt_ext.expand_dims(usm_a, axis=axis)
+    usm_res = dpt.expand_dims(usm_a, axis=axis)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -1926,7 +1919,7 @@ def flip(m, axis=None):
     """
 
     m_usm = dpnp.get_usm_ndarray(m)
-    return dpnp_array._create_from_usm_ndarray(dpt_ext.flip(m_usm, axis=axis))
+    return dpnp_array._create_from_usm_ndarray(dpt.flip(m_usm, axis=axis))
 
 
 def fliplr(m):
@@ -2370,7 +2363,7 @@ def matrix_transpose(x, /):
             f"but it is {usm_x.ndim}"
         )
 
-    usm_res = dpt_ext.matrix_transpose(usm_x)
+    usm_res = dpt.matrix_transpose(usm_x)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -2414,7 +2407,7 @@ def moveaxis(a, source, destination):
 
     usm_array = dpnp.get_usm_ndarray(a)
     return dpnp_array._create_from_usm_ndarray(
-        dpt_ext.moveaxis(usm_array, source, destination)
+        dpt.moveaxis(usm_array, source, destination)
     )
 
 
@@ -2843,7 +2836,7 @@ def repeat(a, repeats, axis=None):
         a = dpnp.ravel(a)
 
     usm_arr = dpnp.get_usm_ndarray(a)
-    usm_res = dpt_ext.repeat(usm_arr, repeats, axis=axis)
+    usm_res = dpt.repeat(usm_arr, repeats, axis=axis)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -3066,7 +3059,7 @@ def reshape(a, /, shape, order="C", *, copy=None):
         )
 
     usm_a = dpnp.get_usm_ndarray(a)
-    usm_res = dpt_ext.reshape(usm_a, shape=shape, order=order, copy=copy)
+    usm_res = dpt.reshape(usm_a, shape=shape, order=order, copy=copy)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -3201,7 +3194,7 @@ def result_type(*arrays_and_dtypes):
         )
         for X in arrays_and_dtypes
     ]
-    return dpt_ext.result_type(*usm_arrays_and_dtypes)
+    return dpt.result_type(*usm_arrays_and_dtypes)
 
 
 def roll(x, shift, axis=None):
@@ -3268,9 +3261,9 @@ def roll(x, shift, axis=None):
         shift = dpnp.asnumpy(shift)
 
     if axis is None:
-        return roll(dpt_ext.reshape(usm_x, -1), shift, 0).reshape(x.shape)
+        return roll(dpt.reshape(usm_x, -1), shift, 0).reshape(x.shape)
 
-    usm_res = dpt_ext.roll(usm_x, shift=shift, axis=axis)
+    usm_res = dpt.roll(usm_x, shift=shift, axis=axis)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -3669,7 +3662,7 @@ def squeeze(a, /, axis=None):
     """
 
     usm_a = dpnp.get_usm_ndarray(a)
-    usm_res = dpt_ext.squeeze(usm_a, axis=axis)
+    usm_res = dpt.squeeze(usm_a, axis=axis)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -3757,7 +3750,7 @@ def stack(arrays, /, *, axis=0, out=None, dtype=None, casting="same_kind"):
         )
 
     usm_arrays = [dpnp.get_usm_ndarray(x) for x in arrays]
-    usm_res = dpt_ext.stack(usm_arrays, axis=axis)
+    usm_res = dpt.stack(usm_arrays, axis=axis)
 
     res = dpnp_array._create_from_usm_ndarray(usm_res)
     if dtype is not None:
@@ -3818,7 +3811,7 @@ def swapaxes(a, axis1, axis2):
     """
 
     usm_a = dpnp.get_usm_ndarray(a)
-    usm_res = dpt_ext.swapaxes(usm_a, axis1=axis1, axis2=axis2)
+    usm_res = dpt.swapaxes(usm_a, axis1=axis1, axis2=axis2)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -3898,7 +3891,7 @@ def tile(A, reps):
     """
 
     usm_a = dpnp.get_usm_ndarray(A)
-    usm_res = dpt_ext.tile(usm_a, reps)
+    usm_res = dpt.tile(usm_a, reps)
     return dpnp_array._create_from_usm_ndarray(usm_res)
 
 
@@ -4528,7 +4521,7 @@ def unstack(x, /, *, axis=0):
     if usm_x.ndim == 0:
         raise ValueError("Input array must be at least 1-d.")
 
-    res = dpt_ext.unstack(usm_x, axis=axis)
+    res = dpt.unstack(usm_x, axis=axis)
     return tuple(dpnp_array._create_from_usm_ndarray(a) for a in res)
 
 
diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py
index 1d89d14c8df8..4063233dc981 100644
--- a/dpnp/dpnp_iface_statistics.py
+++ b/dpnp/dpnp_iface_statistics.py
@@ -39,6 +39,8 @@
 
 """
 
+# pylint: disable=no-name-in-module
+
 import math
 
 import dpctl.utils as dpu
@@ -49,16 +51,14 @@
 import dpctl_ext.tensor as dpt
 import dpctl_ext.tensor._tensor_elementwise_impl as ti
 import dpnp
-
-# pylint: disable=no-name-in-module
 import dpnp.backend.extensions.statistics._statistics_impl as statistics_ext
 from dpctl_ext.tensor._numpy_helper import normalize_axis_index
-from dpnp.dpnp_utils.dpnp_utils_common import (
+
+from .dpnp_utils import get_usm_allocations
+from .dpnp_utils.dpnp_utils_common import (
     result_type_for_device,
     to_supported_dtypes,
 )
-
-from .dpnp_utils import get_usm_allocations
 from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call
 from .dpnp_utils.dpnp_utils_statistics import dpnp_cov, dpnp_median
 
diff --git a/dpnp/dpnp_iface_types.py b/dpnp/dpnp_iface_types.py
index f133333d6b83..7d2d60089d98 100644
--- a/dpnp/dpnp_iface_types.py
+++ b/dpnp/dpnp_iface_types.py
@@ -37,12 +37,11 @@
 import functools
 
 import dpctl
-import dpctl.tensor as dpt
 import numpy
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpnp
 
 from .dpnp_array import dpnp_array
@@ -214,7 +213,7 @@ def finfo(dtype):
     """
     if isinstance(dtype, dpnp_array):
         dtype = dtype.dtype
-    return dpt_ext.finfo(dtype)
+    return dpt.finfo(dtype)
 
 
 # pylint: disable=redefined-outer-name
@@ -247,7 +246,7 @@ def iinfo(dtype):
 
     if isinstance(dtype, dpnp_array):
         dtype = dtype.dtype
-    return dpt_ext.iinfo(dtype)
+    return dpt.iinfo(dtype)
 
 
 def isdtype(dtype, kind):
@@ -301,7 +300,7 @@ def isdtype(dtype, kind):
     elif isinstance(kind, tuple):
         kind = tuple(dpt.dtype(k) if isinstance(k, type) else k for k in kind)
 
-    return dpt_ext.isdtype(dtype, kind)
+    return dpt.isdtype(dtype, kind)
 
 
 def issubdtype(arg1, arg2):
diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py
index ec67b619a13f..cd9932cb7153 100644
--- a/dpnp/dpnp_utils/dpnp_utils_statistics.py
+++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py
@@ -29,13 +29,12 @@
 import warnings
 
 import dpctl
-import dpctl.tensor as dpt
 from dpctl.utils import ExecutionPlacementError
 
-import dpnp
-
 # TODO: revert to `from dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
+import dpnp
 from dpctl_ext.tensor._numpy_helper import normalize_axis_tuple
 from dpnp.dpnp_array import dpnp_array
 
diff --git a/dpnp/exceptions/__init__.py b/dpnp/exceptions/__init__.py
index 26d78a853f41..7abcdbf0553f 100644
--- a/dpnp/exceptions/__init__.py
+++ b/dpnp/exceptions/__init__.py
@@ -32,10 +32,13 @@
     SyclQueueCreationError,
 )
 from dpctl.memory import USMAllocationError
-from dpctl.tensor._dlpack import DLPackCreationError
 from dpctl.utils import ExecutionPlacementError
 from numpy.exceptions import AxisError
 
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._dlpack import DLPackCreationError
+
 __all__ = [
     "AxisError",
     "DLPackCreationError",
diff --git a/dpnp/memory/_memory.py b/dpnp/memory/_memory.py
index f978c5e50db2..3e95baacd424 100644
--- a/dpnp/memory/_memory.py
+++ b/dpnp/memory/_memory.py
@@ -26,11 +26,14 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
 
-import dpctl.tensor as dpt
 from dpctl.memory import MemoryUSMDevice as DPCTLMemoryUSMDevice
 from dpctl.memory import MemoryUSMHost as DPCTLMemoryUSMHost
 from dpctl.memory import MemoryUSMShared as DPCTLMemoryUSMShared
 
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
+
 
 def _add_ptr_property(cls):
     _storage_attr = "_ptr"
diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py
index c03787790280..155f4cdb06fb 100644
--- a/dpnp/tests/test_mathematical.py
+++ b/dpnp/tests/test_mathematical.py
@@ -1,5 +1,4 @@
 import dpctl
-import dpctl.tensor as dpt
 import numpy
 import pytest
 from dpctl.utils import ExecutionPlacementError
@@ -13,7 +12,7 @@
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpnp
 
 # TODO: revert to `from dpctl.tensor...`
@@ -672,15 +671,15 @@ def test_to_begin_to_end(self, to_begin, to_end):
         "to_begin, to_end",
         [
             (-20, 20),
-            (dpt_ext.asarray([-20, -30]), dpt_ext.asarray([20, 15])),
-            (dpt_ext.asarray([[-20, -30]]), dpt_ext.asarray([[20, 15]])),
+            (dpt.asarray([-20, -30]), dpt.asarray([20, 15])),
+            (dpt.asarray([[-20, -30]]), dpt.asarray([[20, 15]])),
             ([1, 2], [3, 4]),
             ((1, 2), (3, 4)),
         ],
     )
     def test_usm_ndarray(self, to_begin, to_end):
         a = numpy.array([[1, 2, 0]])
-        dpt_a = dpt_ext.asarray(a)
+        dpt_a = dpt.asarray(a)
 
         if isinstance(to_begin, dpt.usm_ndarray):
             np_to_begin = dpt.asnumpy(to_begin)
@@ -1581,7 +1580,7 @@ def test_out(self):
         assert_allclose(result, expected)
 
         # output is usm_ndarray
-        dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype)
+        dpt_out = dpt.empty(expected.shape, dtype=expected.dtype)
         result = dpnp.prod(ia, axis=0, out=dpt_out)
         assert dpt_out is result.get_array()
         assert_allclose(result, expected)
@@ -2634,7 +2633,7 @@ def test_out_float16(self, func):
     def test_out_usm_ndarray(self, func, dt):
         a = generate_random_numpy_array(10, dt)
         out = numpy.empty(a.shape, dtype=dt)
-        ia, usm_out = dpnp.array(a), dpt_ext.asarray(out)
+        ia, usm_out = dpnp.array(a), dpt.asarray(out)
 
         expected = getattr(numpy, func)(a, out=out)
         result = getattr(dpnp, func)(ia, out=usm_out)
diff --git a/dpnp/tests/test_memory.py b/dpnp/tests/test_memory.py
index 94aeda33f505..dd87a993e1dc 100644
--- a/dpnp/tests/test_memory.py
+++ b/dpnp/tests/test_memory.py
@@ -1,10 +1,9 @@
-import dpctl.tensor as dpt
 import numpy
 import pytest
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpnp
 import dpnp.memory as dpm
 
@@ -24,7 +23,7 @@ def test_wrong_input_type(self, x):
             dpm.create_data(x)
 
     def test_wrong_usm_data(self):
-        a = dpt_ext.ones(10)
+        a = dpt.ones(10)
         d = IntUsmData(a.shape, buffer=a)
 
         with pytest.raises(TypeError):
diff --git a/dpnp/tests/test_ndarray.py b/dpnp/tests/test_ndarray.py
index a27f0fe6aa14..8944043d90a0 100644
--- a/dpnp/tests/test_ndarray.py
+++ b/dpnp/tests/test_ndarray.py
@@ -1,4 +1,3 @@
-import dpctl.tensor as dpt
 import numpy
 import pytest
 from numpy.testing import (
@@ -11,7 +10,7 @@
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
 import dpnp
 
 from .helper import (
@@ -410,7 +409,7 @@ def test_error(self):
 class TestUsmNdarrayProtocol:
     def test_basic(self):
         a = dpnp.arange(256, dtype=dpnp.int64)
-        usm_a = dpt_ext.asarray(a)
+        usm_a = dpt.asarray(a)
 
         assert a.sycl_queue == usm_a.sycl_queue
         assert a.usm_type == usm_a.usm_type
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
index 41df0a82e0a0..e44f51f09b20 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
@@ -1,10 +1,12 @@
 from __future__ import annotations
 
 import dpctl
-import dpctl.tensor._dlpack as dlp
 import numpy
 import pytest
 
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._dlpack as dlp
 import dpnp as cupy
 from dpnp.tests.third_party.cupy import testing
 

From a31ab47a80e743624544a4cef55c3c1ffe072955 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 7 Apr 2026 14:58:38 +0200
Subject: [PATCH 22/43] Remove dpctl_ext.tensor C-API  (#2830)

This PR removes the unused external C-API from `dpctl_ext.tensor` and
replaces function pointer calls with direct struct member access.

Changes:
1. Remove all `cdef api` functions from `_usmarray.pyx`
2. Delete `dpctl_ext_capi.h` and `DpctlExtCAPI` CMake interface library
3. Update `dpnp4pybind11.hpp` to access `PyUSMArrayObject` members
directly
4. Update build configuration
---
 CMakeLists.txt                                |  11 -
 dpctl_ext/CMakeLists.txt                      |   8 -
 dpctl_ext/apis/include/dpctl_ext_capi.h       | 106 --------
 dpctl_ext/tensor/CMakeLists.txt               |  10 +-
 dpctl_ext/tensor/_usmarray.pyx                | 232 ------------------
 dpnp/backend/extensions/blas/CMakeLists.txt   |   9 +-
 dpnp/backend/extensions/fft/CMakeLists.txt    |   9 +-
 .../extensions/indexing/CMakeLists.txt        |   9 +-
 dpnp/backend/extensions/lapack/CMakeLists.txt |  15 +-
 .../extensions/statistics/CMakeLists.txt      |   9 +-
 dpnp/backend/extensions/ufunc/CMakeLists.txt  |   9 +-
 dpnp/backend/extensions/vm/CMakeLists.txt     |   9 +-
 dpnp/backend/extensions/window/CMakeLists.txt |   9 +-
 dpnp/backend/include/dpnp4pybind11.hpp        | 216 ++++++++--------
 14 files changed, 157 insertions(+), 504 deletions(-)
 delete mode 100644 dpctl_ext/apis/include/dpctl_ext_capi.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5db9fe9a6759..6dedacc3bc43 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -344,16 +344,5 @@ if(DEFINED SKBUILD)
     set(_ignore_me ${SKBUILD})
 endif()
 
-# DpctlExtCAPI: Interface library for dpctl_ext C-API
-# Provides access to:
-# 1. Public C-API headers from dpctl_ext/apis/include
-# 2. Generated Cython headers via per-target header interface libraries
-
-add_library(DpctlExtCAPI INTERFACE)
-target_include_directories(
-    DpctlExtCAPI
-    INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/dpctl_ext/apis/include
-)
-
 add_subdirectory(dpctl_ext)
 add_subdirectory(dpnp)
diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt
index 5baba4de80d0..fa187463414d 100644
--- a/dpctl_ext/CMakeLists.txt
+++ b/dpctl_ext/CMakeLists.txt
@@ -199,12 +199,4 @@ function(build_dpctl_ext _trgt _src _dest)
     target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir})
 endfunction()
 
-# Install dpctl_ext C-API headers (similar to dpctl's C-API installation)
-install(
-    DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/apis/include/
-    DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include
-    FILES_MATCHING
-    REGEX "\\.h(pp)?$"
-)
-
 add_subdirectory(tensor)
diff --git a/dpctl_ext/apis/include/dpctl_ext_capi.h b/dpctl_ext/apis/include/dpctl_ext_capi.h
deleted file mode 100644
index 65d332fb73cc..000000000000
--- a/dpctl_ext/apis/include/dpctl_ext_capi.h
+++ /dev/null
@@ -1,106 +0,0 @@
-//*****************************************************************************
-// Copyright (c) 2026, Intel Corporation
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-// - Neither the name of the copyright holder nor the names of its contributors
-//   may be used to endorse or promote products derived from this software
-//   without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-// THE POSSIBILITY OF SUCH DAMAGE.
-//*****************************************************************************
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file provides access to dpctl_ext's C-API, including:
-/// - dpctl C-API (from external dpctl package - SYCL interface)
-/// - dpctl_ext tensor C-API (usm_ndarray)
-//===---------------------------------------------------------------------===//
-
-#pragma once
-
-// Include dpctl C-API headers explicitly from external dpctl package (SYCL
-// interface)
-// TODO: Once dpctl removes its tensor module and stabilizes dpctl_capi.h,
-// we can simplify to just: #include "dpctl_capi.h"
-// For now, explicit includes ensure we only get SYCL interface without tensor.
-
-#include "syclinterface/dpctl_sycl_extension_interface.h"
-#include "syclinterface/dpctl_sycl_types.h"
-
-#ifdef __cplusplus
-#define CYTHON_EXTERN_C extern "C"
-#else
-#define CYTHON_EXTERN_C
-#endif
-
-#include "dpctl/_sycl_context.h"
-#include "dpctl/_sycl_context_api.h"
-#include "dpctl/_sycl_device.h"
-#include "dpctl/_sycl_device_api.h"
-#include "dpctl/_sycl_event.h"
-#include "dpctl/_sycl_event_api.h"
-#include "dpctl/_sycl_queue.h"
-#include "dpctl/_sycl_queue_api.h"
-#include "dpctl/memory/_memory.h"
-#include "dpctl/memory/_memory_api.h"
-#include "dpctl/program/_program.h"
-#include "dpctl/program/_program_api.h"
-
-// Include the generated Cython C-API headers for usm_ndarray
-// These headers are generated during build and placed in the build directory
-#include "dpctl_ext/tensor/_usmarray.h"
-#include "dpctl_ext/tensor/_usmarray_api.h"
-
-/*
- * Function to import dpctl_ext C-API and make it available.
- * This imports both:
- * - dpctl C-API (from external dpctl package - SYCL interface)
- * - dpctl_ext C-API (tensor interface - usm_ndarray)
- *
- * C functions can use dpctl_ext's C-API functions without linking to
- * shared objects defining these symbols, if they call `import_dpctl_ext()`
- * prior to using those symbols.
- *
- * It is declared inline to allow multiple definitions in
- * different translation units.
- *
- * TODO: When dpctl_ext is renamed to dpctl.tensor:
- *   - Rename this file: dpctl_ext_capi.h → dpctl/tensor/tensor_capi.h
- *     (Use tensor_capi.h, NOT dpctl_capi.h, to avoid conflict with external
- * dpctl)
- *   - Rename this function: import_dpctl_ext() → import_dpctl_tensor()
- *   - Include external dpctl_capi.h and simplify imports to use import_dpctl()
- */
-static inline void import_dpctl_ext(void)
-{
-    // Import dpctl SYCL interface
-    // TODO: Once dpctl removes its tensor module and stabilizes dpctl_capi.h,
-    // we can simplify to just: import_dpctl()
-    import_dpctl___sycl_device();
-    import_dpctl___sycl_context();
-    import_dpctl___sycl_event();
-    import_dpctl___sycl_queue();
-    import_dpctl__memory___memory();
-    import_dpctl__program___program();
-    // Import dpctl_ext tensor interface
-    import_dpctl_ext__tensor___usmarray();
-    return;
-}
diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index 8df593b0838d..13c9e248594c 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -34,8 +34,6 @@ foreach(_cy_file ${_cython_sources})
     get_filename_component(_trgt ${_cy_file} NAME_WLE)
     build_dpctl_ext(${_trgt} ${_cy_file} "dpctl_ext/tensor" RELATIVE_PATH "..")
     target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-    # target_link_libraries(DpctlCAPI INTERFACE ${_trgt}_headers)
-    target_link_libraries(DpctlExtCAPI INTERFACE ${_trgt}_headers)
 endforeach()
 
 if(WIN32)
@@ -318,6 +316,7 @@ foreach(python_module_name ${_py_trgts})
             ${Dpctl_INCLUDE_DIR}
             ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include
             ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/
+            ${CMAKE_BINARY_DIR} # For generated Cython headers
     )
     target_link_options(${python_module_name} PRIVATE ${_linker_options})
     if(DPCTL_GENERATE_COVERAGE)
@@ -343,11 +342,8 @@ foreach(python_module_name ${_py_trgts})
             PRIVATE ${_dpnp_sycl_target_link_options}
         )
     endif()
-    # TODO: update source so they reference individual libraries instead of
-    #   dpctl4pybind11.hpp. It will allow to simplify dependency tree
-    # NOTE: dpctl C-API is resolved at runtime via Python
-    # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI)
-    target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+    # Ensure Cython modules build first so _usmarray.h exists
+    add_dependencies(${python_module_name} _usmarray)
     if(DPNP_WITH_REDIST)
         set_target_properties(
             ${python_module_name}
diff --git a/dpctl_ext/tensor/_usmarray.pyx b/dpctl_ext/tensor/_usmarray.pyx
index 4f3856a29fe4..e3b33fd71ac8 100644
--- a/dpctl_ext/tensor/_usmarray.pyx
+++ b/dpctl_ext/tensor/_usmarray.pyx
@@ -1749,238 +1749,6 @@ cdef usm_ndarray _zero_like(usm_ndarray ary):
     return r
 
 
-cdef api char* UsmNDArray_GetData(usm_ndarray arr):
-    """Get allocation pointer of zero index element of array """
-    return arr.get_data()
-
-
-cdef api int UsmNDArray_GetNDim(usm_ndarray arr):
-    """Get array rank: length of its shape"""
-    return arr.get_ndim()
-
-
-cdef api Py_ssize_t* UsmNDArray_GetShape(usm_ndarray arr):
-    """Get host pointer to shape vector"""
-    return arr.get_shape()
-
-
-cdef api Py_ssize_t* UsmNDArray_GetStrides(usm_ndarray arr):
-    """Get host pointer to strides vector"""
-    return arr.get_strides()
-
-
-cdef api int UsmNDArray_GetTypenum(usm_ndarray arr):
-    """Get type number for data type of array elements"""
-    return arr.get_typenum()
-
-
-cdef api int UsmNDArray_GetElementSize(usm_ndarray arr):
-    """Get array element size in bytes"""
-    return arr.get_itemsize()
-
-
-cdef api int UsmNDArray_GetFlags(usm_ndarray arr):
-    """Get flags of array"""
-    return arr.get_flags()
-
-
-cdef api c_dpctl.DPCTLSyclQueueRef UsmNDArray_GetQueueRef(usm_ndarray arr):
-    """Get DPCTLSyclQueueRef for queue associated with the array"""
-    return arr.get_queue_ref()
-
-
-cdef api Py_ssize_t UsmNDArray_GetOffset(usm_ndarray arr):
-    """Get offset of zero-index array element from the beginning of the USM
-    allocation"""
-    return arr.get_offset()
-
-
-cdef api object UsmNDArray_GetUSMData(usm_ndarray arr):
-    """Get USM data object underlying the array"""
-    return arr.get_base()
-
-
-cdef api void UsmNDArray_SetWritableFlag(usm_ndarray arr, int flag):
-    """Set/unset USM_ARRAY_WRITABLE in the given array `arr`."""
-    arr._set_writable_flag(flag)
-
-
-cdef api object UsmNDArray_MakeSimpleFromMemory(
-    int nd, const Py_ssize_t *shape, int typenum,
-    c_dpmem._Memory mobj, Py_ssize_t offset, char order
-):
-    """Create contiguous usm_ndarray.
-
-    Args:
-        nd: number of dimensions (non-negative)
-        shape: array of nd non-negative array's sizes along each dimension
-        typenum: array elemental type number
-        ptr: pointer to the start of allocation
-        QRef: DPCTLSyclQueueRef associated with the allocation
-        offset: distance between element with zero multi-index and the
-                start of allocation
-        order: Memory layout of the array. Use 'C' for C-contiguous or
-               row-major layout; 'F' for F-contiguous or column-major layout
-    Returns:
-        Created usm_ndarray instance
-    """
-    cdef object shape_tuple = _make_int_tuple(nd, <Py_ssize_t *>shape)
-    cdef usm_ndarray arr = usm_ndarray(
-        shape_tuple,
-        dtype=_make_typestr(typenum),
-        buffer=mobj,
-        offset=offset,
-        order=<bytes>(order)
-    )
-    return arr
-
-
-cdef api object UsmNDArray_MakeSimpleFromPtr(
-    size_t nelems,
-    int typenum,
-    c_dpctl.DPCTLSyclUSMRef ptr,
-    c_dpctl.DPCTLSyclQueueRef QRef,
-    object owner
-):
-    """Create 1D contiguous usm_ndarray from pointer.
-
-    Args:
-        nelems: number of elements in array
-        typenum: array elemental type number
-        ptr: pointer to the start of allocation
-        QRef: DPCTLSyclQueueRef associated with the allocation
-        owner: Python object managing lifetime of USM allocation.
-               Value None implies transfer of USM allocation ownership
-               to the created array object.
-    Returns:
-        Created usm_ndarray instance
-    """
-    cdef int itemsize = type_bytesize(typenum)
-    if (itemsize < 1):
-        raise ValueError(
-            "dtype with typenum=" + str(typenum) + " is not supported."
-        )
-    cdef size_t nbytes = (<size_t> itemsize) * nelems
-    cdef c_dpmem._Memory mobj
-    mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
-        ptr, nbytes, QRef, memory_owner=owner
-    )
-    cdef usm_ndarray arr = usm_ndarray(
-        (nelems,),
-        dtype=_make_typestr(typenum),
-        buffer=mobj
-    )
-    return arr
-
-cdef api object UsmNDArray_MakeFromPtr(
-    int nd,
-    const Py_ssize_t *shape,
-    int typenum,
-    const Py_ssize_t *strides,
-    c_dpctl.DPCTLSyclUSMRef ptr,
-    c_dpctl.DPCTLSyclQueueRef QRef,
-    Py_ssize_t offset,
-    object owner
-):
-    """
-    General usm_ndarray constructor from externally made USM-allocation.
-
-    Args:
-        nd: number of dimensions (non-negative)
-        shape: array of nd non-negative array's sizes along each dimension
-        typenum: array elemental type number
-        strides: array of nd strides along each dimension in elements
-        ptr: pointer to the start of allocation
-        QRef: DPCTLSyclQueueRef associated with the allocation
-        offset: distance between element with zero multi-index and the
-                start of allocation
-        owner: Python object managing lifetime of USM allocation.
-               Value None implies transfer of USM allocation ownership
-               to the created array object.
-    Returns:
-        Created usm_ndarray instance
-    """
-    cdef int itemsize = type_bytesize(typenum)
-    cdef size_t nelems = 1
-    cdef Py_ssize_t min_disp = 0
-    cdef Py_ssize_t max_disp = 0
-    cdef Py_ssize_t step_ = 0
-    cdef Py_ssize_t dim_ = 0
-    cdef it = 0
-    cdef c_dpmem._Memory mobj
-    cdef usm_ndarray arr
-    cdef object obj_shape
-    cdef object obj_strides
-
-    if (itemsize < 1):
-        raise ValueError(
-            "dtype with typenum=" + str(typenum) + " is not supported."
-        )
-    if (nd < 0):
-        raise ValueError("Dimensionality must be non-negative")
-    if (ptr is NULL or QRef is NULL):
-        raise ValueError(
-            "Non-null USM allocation pointer and QRef are expected"
-        )
-    if (nd == 0):
-        # case of 0d scalars
-        mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
-            ptr, itemsize, QRef, memory_owner=owner
-        )
-        arr = usm_ndarray(
-            tuple(),
-            dtype=_make_typestr(typenum),
-            buffer=mobj
-        )
-        return arr
-    if (shape is NULL or strides is NULL):
-        raise ValueError("Both shape and stride vectors are required")
-    for it in range(nd):
-        dim_ = shape[it]
-        if dim_ < 0:
-            raise ValueError(
-                f"Dimension along axis {it} must be non-negative"
-            )
-        nelems *= dim_
-        if dim_ > 0:
-            step_ = strides[it]
-            if step_ > 0:
-                max_disp += step_ * (dim_ - 1)
-            else:
-                min_disp += step_ * (dim_ - 1)
-
-    obj_shape = _make_int_tuple(nd, shape)
-    obj_strides = _make_int_tuple(nd, strides)
-    if nelems == 0:
-        mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
-            ptr, itemsize, QRef, memory_owner=owner
-        )
-        arr = usm_ndarray(
-            obj_shape,
-            dtype=_make_typestr(typenum),
-            strides=obj_strides,
-            buffer=mobj,
-            offset=0
-        )
-        return arr
-    if offset + min_disp < 0:
-        raise ValueError(
-            "Given shape, strides and offset reference out-of-bound memory"
-        )
-    nbytes = (<size_t> itemsize) * (offset + max_disp + 1)
-    mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
-        ptr, nbytes, QRef, memory_owner=owner
-    )
-    arr = usm_ndarray(
-        obj_shape,
-        dtype=_make_typestr(typenum),
-        strides=obj_strides,
-        buffer=mobj,
-        offset=offset
-    )
-    return arr
-
-
 def _is_object_with_buffer_protocol(o):
     "Returns True if object supports Python buffer protocol"
     return _is_buffer(o)
diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt
index 2dce27001bbd..1bf6055d080b 100644
--- a/dpnp/backend/extensions/blas/CMakeLists.txt
+++ b/dpnp/backend/extensions/blas/CMakeLists.txt
@@ -39,7 +39,8 @@ set(_module_src
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
-target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+# Ensure Cython modules build first so _usmarray.h exists
+add_dependencies(${python_module_name} _usmarray)
 
 if(_dpnp_sycl_targets)
     # make fat binary
@@ -77,7 +78,11 @@ target_include_directories(
 target_include_directories(
     ${python_module_name}
     SYSTEM
-    PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR}
+    PRIVATE
+        ${SYCL_INCLUDE_DIR}
+        ${Dpctl_INCLUDE_DIRS}
+        ${Dpctl_TENSOR_INCLUDE_DIR}
+        ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
 if(WIN32)
diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt
index bfebe1ed4226..28433ab5d98f 100644
--- a/dpnp/backend/extensions/fft/CMakeLists.txt
+++ b/dpnp/backend/extensions/fft/CMakeLists.txt
@@ -33,7 +33,8 @@ set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/fft_py.cpp)
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
-target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+# Ensure Cython modules build first so _usmarray.h exists
+add_dependencies(${python_module_name} _usmarray)
 
 if(_dpnp_sycl_targets)
     # make fat binary
@@ -70,7 +71,11 @@ target_include_directories(
 target_include_directories(
     ${python_module_name}
     SYSTEM
-    PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR}
+    PRIVATE
+        ${SYCL_INCLUDE_DIR}
+        ${Dpctl_INCLUDE_DIRS}
+        ${Dpctl_TENSOR_INCLUDE_DIR}
+        ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
 if(WIN32)
diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt
index 7729e2807a4d..e00cee6a29a5 100644
--- a/dpnp/backend/extensions/indexing/CMakeLists.txt
+++ b/dpnp/backend/extensions/indexing/CMakeLists.txt
@@ -36,7 +36,8 @@ set(_module_src
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
-target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+# Ensure Cython modules build first so _usmarray.h exists
+add_dependencies(${python_module_name} _usmarray)
 
 if(_dpnp_sycl_targets)
     # make fat binary
@@ -74,7 +75,11 @@ target_include_directories(
 target_include_directories(
     ${python_module_name}
     SYSTEM
-    PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR}
+    PRIVATE
+        ${SYCL_INCLUDE_DIR}
+        ${Dpctl_INCLUDE_DIRS}
+        ${Dpctl_TENSOR_INCLUDE_DIR}
+        ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
 if(WIN32)
diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt
index a3ee4bae8ee5..3105771d9722 100644
--- a/dpnp/backend/extensions/lapack/CMakeLists.txt
+++ b/dpnp/backend/extensions/lapack/CMakeLists.txt
@@ -55,7 +55,9 @@ set(_module_src
 
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
-target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
+# Ensure Cython modules build first so _usmarray.h exists
+add_dependencies(${python_module_name} _usmarray)
 
 if(_dpnp_sycl_targets)
     # make fat binary
@@ -81,11 +83,6 @@ set_target_properties(
     PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON
 )
 
-target_include_directories(
-    ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
-)
-
 target_include_directories(
     ${python_module_name}
     PRIVATE
@@ -99,7 +96,11 @@ target_include_directories(
 target_include_directories(
     ${python_module_name}
     SYSTEM
-    PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR}
+    PRIVATE
+        ${SYCL_INCLUDE_DIR}
+        ${Dpctl_INCLUDE_DIRS}
+        ${Dpctl_TENSOR_INCLUDE_DIR}
+        ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
 if(WIN32)
diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index 88b3f185e6f6..0e96d7ead6c1 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -41,7 +41,8 @@ set(_module_src
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
-target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+# Ensure Cython modules build first so _usmarray.h exists
+add_dependencies(${python_module_name} _usmarray)
 
 if(_dpnp_sycl_targets)
     # make fat binary
@@ -79,7 +80,11 @@ target_include_directories(
 target_include_directories(
     ${python_module_name}
     SYSTEM
-    PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR}
+    PRIVATE
+        ${SYCL_INCLUDE_DIR}
+        ${Dpctl_INCLUDE_DIRS}
+        ${Dpctl_TENSOR_INCLUDE_DIR}
+        ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
 if(WIN32)
diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt
index d954316dcb2a..53d3a64122b3 100644
--- a/dpnp/backend/extensions/ufunc/CMakeLists.txt
+++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt
@@ -67,7 +67,8 @@ set(_module_src
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
-target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+# Ensure Cython modules build first so _usmarray.h exists
+add_dependencies(${python_module_name} _usmarray)
 
 if(WIN32)
     if(${CMAKE_VERSION} VERSION_LESS "3.27")
@@ -97,7 +98,11 @@ target_include_directories(
 target_include_directories(
     ${python_module_name}
     SYSTEM
-    PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR}
+    PRIVATE
+        ${SYCL_INCLUDE_DIR}
+        ${Dpctl_INCLUDE_DIRS}
+        ${Dpctl_TENSOR_INCLUDE_DIR}
+        ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
 if(_dpnp_sycl_targets)
diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt
index 0d69c4e79c03..5e0409f77671 100644
--- a/dpnp/backend/extensions/vm/CMakeLists.txt
+++ b/dpnp/backend/extensions/vm/CMakeLists.txt
@@ -90,7 +90,8 @@ set(python_module_name _vm_impl)
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
-target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+# Ensure Cython modules build first so _usmarray.h exists
+add_dependencies(${python_module_name} _usmarray)
 
 if(WIN32)
     if(${CMAKE_VERSION} VERSION_LESS "3.27")
@@ -119,7 +120,11 @@ target_include_directories(
 target_include_directories(
     ${python_module_name}
     SYSTEM
-    PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR}
+    PRIVATE
+        ${SYCL_INCLUDE_DIR}
+        ${Dpctl_INCLUDE_DIRS}
+        ${Dpctl_TENSOR_INCLUDE_DIR}
+        ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
 if(WIN32)
diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt
index c8cbd7c03bbc..6898cdb332e0 100644
--- a/dpnp/backend/extensions/window/CMakeLists.txt
+++ b/dpnp/backend/extensions/window/CMakeLists.txt
@@ -36,7 +36,8 @@ set(_module_src
 pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
-target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+# Ensure Cython modules build first so _usmarray.h exists
+add_dependencies(${python_module_name} _usmarray)
 
 if(_dpnp_sycl_targets)
     # make fat binary
@@ -75,7 +76,11 @@ target_include_directories(
 target_include_directories(
     ${python_module_name}
     SYSTEM
-    PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR}
+    PRIVATE
+        ${SYCL_INCLUDE_DIR}
+        ${Dpctl_INCLUDE_DIRS}
+        ${Dpctl_TENSOR_INCLUDE_DIR}
+        ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
 if(WIN32)
diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp
index 43a3df995cc6..d8e4c81b20a5 100644
--- a/dpnp/backend/include/dpnp4pybind11.hpp
+++ b/dpnp/backend/include/dpnp4pybind11.hpp
@@ -28,17 +28,33 @@
 
 #pragma once
 
-// Include dpctl_ext C-API (provides unified access to both dpctl and dpctl_ext)
-// This includes:
-// - dpctl C-API (from external dpctl package - SYCL interface)
-// - dpctl_ext C-API (tensor interface: usm_ndarray)
-//
-// TODO: When dpctl_ext is renamed to dpctl.tensor:
-//   - Update include: "dpctl_ext_capi.h" → "dpctl/tensor/tensor_capi.h"
-//     (Use tensor_capi.h, NOT dpctl_capi.h, to avoid conflict with external
-//     dpctl)
-//   - Update import calls: import_dpctl_ext() → import_dpctl_tensor()
-#include "dpctl_ext_capi.h"
+// Include dpctl SYCL interface from external dpctl package
+#include "syclinterface/dpctl_sycl_extension_interface.h"
+#include "syclinterface/dpctl_sycl_types.h"
+
+#ifdef __cplusplus
+#define CYTHON_EXTERN_C extern "C"
+#else
+#define CYTHON_EXTERN_C
+#endif
+
+// Include dpctl C-API headers (both declarations and import functions)
+#include "dpctl/_sycl_context.h"
+#include "dpctl/_sycl_context_api.h"
+#include "dpctl/_sycl_device.h"
+#include "dpctl/_sycl_device_api.h"
+#include "dpctl/_sycl_event.h"
+#include "dpctl/_sycl_event_api.h"
+#include "dpctl/_sycl_queue.h"
+#include "dpctl/_sycl_queue_api.h"
+#include "dpctl/memory/_memory.h"
+#include "dpctl/memory/_memory_api.h"
+#include "dpctl/program/_program.h"
+#include "dpctl/program/_program_api.h"
+
+// Include generated Cython headers for usm_ndarray struct definition and C-API
+#include "dpctl_ext/tensor/_usmarray.h"
+#include "dpctl_ext/tensor/_usmarray_api.h"
 
 #include <array>
 #include <cassert>
@@ -129,38 +145,6 @@ class dpctl_capi
         PySyclProgramObject *);
     PySyclProgramObject *(*SyclProgram_Make_)(DPCTLSyclKernelBundleRef);
 
-    // tensor
-    char *(*UsmNDArray_GetData_)(PyUSMArrayObject *);
-    int (*UsmNDArray_GetNDim_)(PyUSMArrayObject *);
-    py::ssize_t *(*UsmNDArray_GetShape_)(PyUSMArrayObject *);
-    py::ssize_t *(*UsmNDArray_GetStrides_)(PyUSMArrayObject *);
-    int (*UsmNDArray_GetTypenum_)(PyUSMArrayObject *);
-    int (*UsmNDArray_GetElementSize_)(PyUSMArrayObject *);
-    int (*UsmNDArray_GetFlags_)(PyUSMArrayObject *);
-    DPCTLSyclQueueRef (*UsmNDArray_GetQueueRef_)(PyUSMArrayObject *);
-    py::ssize_t (*UsmNDArray_GetOffset_)(PyUSMArrayObject *);
-    PyObject *(*UsmNDArray_GetUSMData_)(PyUSMArrayObject *);
-    void (*UsmNDArray_SetWritableFlag_)(PyUSMArrayObject *, int);
-    PyObject *(*UsmNDArray_MakeSimpleFromMemory_)(int,
-                                                  const py::ssize_t *,
-                                                  int,
-                                                  Py_MemoryObject *,
-                                                  py::ssize_t,
-                                                  char);
-    PyObject *(*UsmNDArray_MakeSimpleFromPtr_)(size_t,
-                                               int,
-                                               DPCTLSyclUSMRef,
-                                               DPCTLSyclQueueRef,
-                                               PyObject *);
-    PyObject *(*UsmNDArray_MakeFromPtr_)(int,
-                                         const py::ssize_t *,
-                                         int,
-                                         const py::ssize_t *,
-                                         DPCTLSyclUSMRef,
-                                         DPCTLSyclQueueRef,
-                                         py::ssize_t,
-                                         PyObject *);
-
     int USM_ARRAY_C_CONTIGUOUS_;
     int USM_ARRAY_F_CONTIGUOUS_;
     int USM_ARRAY_WRITABLE_;
@@ -268,15 +252,7 @@ class dpctl_capi
           Memory_GetQueueRef_(nullptr), Memory_GetNumBytes_(nullptr),
           Memory_Make_(nullptr), SyclKernel_GetKernelRef_(nullptr),
           SyclKernel_Make_(nullptr), SyclProgram_GetKernelBundleRef_(nullptr),
-          SyclProgram_Make_(nullptr), UsmNDArray_GetData_(nullptr),
-          UsmNDArray_GetNDim_(nullptr), UsmNDArray_GetShape_(nullptr),
-          UsmNDArray_GetStrides_(nullptr), UsmNDArray_GetTypenum_(nullptr),
-          UsmNDArray_GetElementSize_(nullptr), UsmNDArray_GetFlags_(nullptr),
-          UsmNDArray_GetQueueRef_(nullptr), UsmNDArray_GetOffset_(nullptr),
-          UsmNDArray_GetUSMData_(nullptr), UsmNDArray_SetWritableFlag_(nullptr),
-          UsmNDArray_MakeSimpleFromMemory_(nullptr),
-          UsmNDArray_MakeSimpleFromPtr_(nullptr),
-          UsmNDArray_MakeFromPtr_(nullptr), USM_ARRAY_C_CONTIGUOUS_(0),
+          SyclProgram_Make_(nullptr), USM_ARRAY_C_CONTIGUOUS_(0),
           USM_ARRAY_F_CONTIGUOUS_(0), USM_ARRAY_WRITABLE_(0), UAR_BOOL_(-1),
           UAR_BYTE_(-1), UAR_UBYTE_(-1), UAR_SHORT_(-1), UAR_USHORT_(-1),
           UAR_INT_(-1), UAR_UINT_(-1), UAR_LONG_(-1), UAR_ULONG_(-1),
@@ -288,14 +264,16 @@ class dpctl_capi
           default_usm_memory_{}, default_usm_ndarray_{}, as_usm_memory_{}
 
     {
-        // Import Cython-generated C-API for dpctl
-        // This imports python modules and initializes
-        // static variables such as function pointers for C-API,
-        // e.g. SyclDevice_GetDeviceRef, etc.
-        // pointers to Python types, i.e. PySyclDeviceType, etc.
-        // and exported constants, i.e. USM_ARRAY_C_CONTIGUOUS, etc.
-        // TODO: rename once dpctl_ext is renamed
-        import_dpctl_ext(); // Imports both dpctl and dpctl_ext C-APIs
+        // Import dpctl SYCL interface modules
+        // This imports python modules and initializes pointers to Python types
+        import_dpctl___sycl_device();
+        import_dpctl___sycl_context();
+        import_dpctl___sycl_event();
+        import_dpctl___sycl_queue();
+        import_dpctl__memory___memory();
+        import_dpctl__program___program();
+        // Import dpctl_ext tensor module for PyUSMArrayType
+        import_dpctl_ext__tensor___usmarray();
 
         // Python type objects for classes implemented by dpctl
         this->Py_SyclDeviceType_ = &Py_SyclDeviceType;
@@ -344,23 +322,6 @@ class dpctl_capi
         this->SyclProgram_GetKernelBundleRef_ = SyclProgram_GetKernelBundleRef;
         this->SyclProgram_Make_ = SyclProgram_Make;
 
-        // dpctl.tensor.usm_ndarray API
-        this->UsmNDArray_GetData_ = UsmNDArray_GetData;
-        this->UsmNDArray_GetNDim_ = UsmNDArray_GetNDim;
-        this->UsmNDArray_GetShape_ = UsmNDArray_GetShape;
-        this->UsmNDArray_GetStrides_ = UsmNDArray_GetStrides;
-        this->UsmNDArray_GetTypenum_ = UsmNDArray_GetTypenum;
-        this->UsmNDArray_GetElementSize_ = UsmNDArray_GetElementSize;
-        this->UsmNDArray_GetFlags_ = UsmNDArray_GetFlags;
-        this->UsmNDArray_GetQueueRef_ = UsmNDArray_GetQueueRef;
-        this->UsmNDArray_GetOffset_ = UsmNDArray_GetOffset;
-        this->UsmNDArray_GetUSMData_ = UsmNDArray_GetUSMData;
-        this->UsmNDArray_SetWritableFlag_ = UsmNDArray_SetWritableFlag;
-        this->UsmNDArray_MakeSimpleFromMemory_ =
-            UsmNDArray_MakeSimpleFromMemory;
-        this->UsmNDArray_MakeSimpleFromPtr_ = UsmNDArray_MakeSimpleFromPtr;
-        this->UsmNDArray_MakeFromPtr_ = UsmNDArray_MakeFromPtr;
-
         // constants
         this->USM_ARRAY_C_CONTIGUOUS_ = USM_ARRAY_C_CONTIGUOUS;
         this->USM_ARRAY_F_CONTIGUOUS_ = USM_ARRAY_F_CONTIGUOUS;
@@ -984,9 +945,7 @@ class usm_ndarray : public py::object
     char *get_data() const
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetData_(raw_ar);
+        return raw_ar->data_;
     }
 
     template <typename T>
@@ -998,17 +957,13 @@ class usm_ndarray : public py::object
     int get_ndim() const
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetNDim_(raw_ar);
+        return raw_ar->nd_;
     }
 
     const py::ssize_t *get_shape_raw() const
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetShape_(raw_ar);
+        return raw_ar->shape_;
     }
 
     std::vector<py::ssize_t> get_shape_vector() const
@@ -1029,9 +984,7 @@ class usm_ndarray : public py::object
     const py::ssize_t *get_strides_raw() const
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetStrides_(raw_ar);
+        return raw_ar->strides_;
     }
 
     std::vector<py::ssize_t> get_strides_vector() const
@@ -1066,9 +1019,8 @@ class usm_ndarray : public py::object
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
 
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        int ndim = api.UsmNDArray_GetNDim_(raw_ar);
-        const py::ssize_t *shape = api.UsmNDArray_GetShape_(raw_ar);
+        int ndim = raw_ar->nd_;
+        const py::ssize_t *shape = raw_ar->shape_;
 
         py::ssize_t nelems = 1;
         for (int i = 0; i < ndim; ++i) {
@@ -1083,10 +1035,9 @@ class usm_ndarray : public py::object
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
 
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        int nd = api.UsmNDArray_GetNDim_(raw_ar);
-        const py::ssize_t *shape = api.UsmNDArray_GetShape_(raw_ar);
-        const py::ssize_t *strides = api.UsmNDArray_GetStrides_(raw_ar);
+        int nd = raw_ar->nd_;
+        const py::ssize_t *shape = raw_ar->shape_;
+        const py::ssize_t *strides = raw_ar->strides_;
 
         py::ssize_t offset_min = 0;
         py::ssize_t offset_max = 0;
@@ -1114,43 +1065,77 @@ class usm_ndarray : public py::object
     sycl::queue get_queue() const
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
+        Py_MemoryObject *mem_obj =
+            reinterpret_cast<Py_MemoryObject *>(raw_ar->base_);
 
         auto const &api = ::dpctl::detail::dpctl_capi::get();
-        DPCTLSyclQueueRef QRef = api.UsmNDArray_GetQueueRef_(raw_ar);
+        DPCTLSyclQueueRef QRef = api.Memory_GetQueueRef_(mem_obj);
         return *(reinterpret_cast<sycl::queue *>(QRef));
     }
 
     sycl::device get_device() const
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
+        Py_MemoryObject *mem_obj =
+            reinterpret_cast<Py_MemoryObject *>(raw_ar->base_);
 
         auto const &api = ::dpctl::detail::dpctl_capi::get();
-        DPCTLSyclQueueRef QRef = api.UsmNDArray_GetQueueRef_(raw_ar);
+        DPCTLSyclQueueRef QRef = api.Memory_GetQueueRef_(mem_obj);
         return reinterpret_cast<sycl::queue *>(QRef)->get_device();
     }
 
     int get_typenum() const
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetTypenum_(raw_ar);
+        return raw_ar->typenum_;
     }
 
     int get_flags() const
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetFlags_(raw_ar);
+        return raw_ar->flags_;
     }
 
     int get_elemsize() const
     {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
+        int typenum = get_typenum();
         auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetElementSize_(raw_ar);
+
+        // Lookup table for element sizes based on typenum
+        if (typenum == api.UAR_BOOL_)
+            return 1;
+        if (typenum == api.UAR_BYTE_)
+            return 1;
+        if (typenum == api.UAR_UBYTE_)
+            return 1;
+        if (typenum == api.UAR_SHORT_)
+            return 2;
+        if (typenum == api.UAR_USHORT_)
+            return 2;
+        if (typenum == api.UAR_INT_)
+            return 4;
+        if (typenum == api.UAR_UINT_)
+            return 4;
+        if (typenum == api.UAR_LONG_)
+            return sizeof(long);
+        if (typenum == api.UAR_ULONG_)
+            return sizeof(unsigned long);
+        if (typenum == api.UAR_LONGLONG_)
+            return 8;
+        if (typenum == api.UAR_ULONGLONG_)
+            return 8;
+        if (typenum == api.UAR_FLOAT_)
+            return 4;
+        if (typenum == api.UAR_DOUBLE_)
+            return 8;
+        if (typenum == api.UAR_CFLOAT_)
+            return 8;
+        if (typenum == api.UAR_CDOUBLE_)
+            return 16;
+        if (typenum == api.UAR_HALF_)
+            return 2;
+
+        return 0; // Unknown type
     }
 
     bool is_c_contiguous() const
@@ -1178,10 +1163,9 @@ class usm_ndarray : public py::object
     py::object get_usm_data() const
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        // UsmNDArray_GetUSMData_ gives a new reference
-        PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);
+        // base_ is the Memory object - return new reference
+        PyObject *usm_data = raw_ar->base_;
+        Py_XINCREF(usm_data);
 
         // pass reference ownership to py::object
         return py::reinterpret_steal<py::object>(usm_data);
@@ -1190,12 +1174,10 @@ class usm_ndarray : public py::object
     bool is_managed_by_smart_ptr() const
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
+        PyObject *usm_data = raw_ar->base_;
 
         auto const &api = ::dpctl::detail::dpctl_capi::get();
-        PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);
-
         if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) {
-            Py_DECREF(usm_data);
             return false;
         }
 
@@ -1203,20 +1185,17 @@ class usm_ndarray : public py::object
             reinterpret_cast<Py_MemoryObject *>(usm_data);
         const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
 
-        Py_DECREF(usm_data);
         return bool(opaque_ptr);
     }
 
     const std::shared_ptr<void> &get_smart_ptr_owner() const
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
+        PyObject *usm_data = raw_ar->base_;
 
         auto const &api = ::dpctl::detail::dpctl_capi::get();
 
-        PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);
-
         if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) {
-            Py_DECREF(usm_data);
             throw std::runtime_error(
                 "usm_ndarray object does not have Memory object "
                 "managing lifetime of USM allocation");
@@ -1225,7 +1204,6 @@ class usm_ndarray : public py::object
         Py_MemoryObject *mem_obj =
             reinterpret_cast<Py_MemoryObject *>(usm_data);
         void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
-        Py_DECREF(usm_data);
 
         if (opaque_ptr) {
             auto shptr_ptr =

From ed3476f03e5b8ce5a9e4839de4db32ae39c90861 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 8 Apr 2026 16:57:41 +0200
Subject: [PATCH 23/43] Move dpctl_ext.tensor to dpnp.tensor (#2829)

This PR proposes a refactoring that migrates `dpctl_ext.tensor` module
into `dpnp` package as `dpnp.tensor`

Changes:

1. Moved `dpctl_ext/tensor/` directory to `dpnp/tensor/`
2. Updated all imports from `dpctl_ext.tensor` to `dpnp.tensor` across
the codebase
3. Consolidated build: removed dpctl_ext/CMakeLists.txt, added
build_dpnp_tensor_ext() to dpnp/CMakeLists.txt
4. Added `DPNP_BUILD_COMPONENTS` CMake option
(`ALL/TENSOR_ONLY/SKIP_TENSOR`) for staged builds
5. Split coverage workflow into two steps to avoid memory issues
6. Updated include paths in all backend extension CMake files
7. Removed `dpctl_ext/` directory and cleaned up `.gitignore`
---
 .gitignore                                    |   6 +-
 CMakeLists.txt                                |  12 +-
 dpctl_ext/CMakeLists.txt                      | 202 ------------------
 dpctl_ext/__init__.py                         |  27 ---
 dpnp/CMakeLists.txt                           |  85 ++++++++
 dpnp/__init__.py                              |   4 +-
 dpnp/backend/extensions/blas/CMakeLists.txt   |   2 +-
 dpnp/backend/extensions/fft/CMakeLists.txt    |   2 +-
 .../extensions/indexing/CMakeLists.txt        |   2 +-
 dpnp/backend/extensions/lapack/CMakeLists.txt |   2 +-
 .../extensions/statistics/CMakeLists.txt      |   2 +-
 dpnp/backend/extensions/ufunc/CMakeLists.txt  |   2 +-
 dpnp/backend/extensions/vm/CMakeLists.txt     |   2 +-
 dpnp/backend/extensions/window/CMakeLists.txt |   2 +-
 dpnp/backend/include/dpnp4pybind11.hpp        |  16 +-
 dpnp/dpnp_algo/dpnp_arraycreation.py          |   4 +-
 dpnp/dpnp_algo/dpnp_elementwise_common.py     |  29 ++-
 dpnp/dpnp_algo/dpnp_fill.py                   |   8 +-
 dpnp/dpnp_array.py                            |   9 +-
 dpnp/dpnp_array_api_info.py                   |   4 +-
 dpnp/dpnp_container.py                        |   4 +-
 dpnp/dpnp_iface.py                            |  11 +-
 dpnp/dpnp_iface_arraycreation.py              |   4 +-
 dpnp/dpnp_iface_bitwise.py                    |   4 +-
 dpnp/dpnp_iface_counting.py                   |   4 +-
 dpnp/dpnp_iface_functional.py                 |   9 +-
 dpnp/dpnp_iface_indexing.py                   |  15 +-
 dpnp/dpnp_iface_logic.py                      |   6 +-
 dpnp/dpnp_iface_manipulation.py               |  14 +-
 dpnp/dpnp_iface_mathematical.py               |  16 +-
 dpnp/dpnp_iface_searching.py                  |  11 +-
 dpnp/dpnp_iface_sorting.py                    |   6 +-
 dpnp/dpnp_iface_statistics.py                 |   8 +-
 dpnp/dpnp_iface_trigonometric.py              |   8 +-
 dpnp/dpnp_iface_types.py                      |   4 +-
 dpnp/dpnp_utils/dpnp_algo_utils.pyx           |   2 +-
 dpnp/dpnp_utils/dpnp_utils_common.py          |   4 +-
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py   |  15 +-
 dpnp/dpnp_utils/dpnp_utils_statistics.py      |   6 +-
 dpnp/exceptions/__init__.py                   |   4 +-
 dpnp/fft/dpnp_utils_fft.py                    |   6 +-
 dpnp/linalg/dpnp_iface_linalg.py              |   5 +-
 dpnp/linalg/dpnp_utils_linalg.py              |   9 +-
 dpnp/memory/_memory.py                        |   4 +-
 dpnp/scipy/linalg/_utils.py                   |   7 +-
 {dpctl_ext => dpnp}/tensor/CMakeLists.txt     |  41 +++-
 {dpctl_ext => dpnp}/tensor/__init__.pxd       |   0
 {dpctl_ext => dpnp}/tensor/__init__.py        |   0
 {dpctl_ext => dpnp}/tensor/_accumulation.py   |   8 +-
 {dpctl_ext => dpnp}/tensor/_array_api.py      |   4 +-
 {dpctl_ext => dpnp}/tensor/_clip.py           |   8 +-
 {dpctl_ext => dpnp}/tensor/_constants.py      |   0
 {dpctl_ext => dpnp}/tensor/_copy_utils.py     |   6 +-
 {dpctl_ext => dpnp}/tensor/_ctors.py          |   6 +-
 {dpctl_ext => dpnp}/tensor/_data_types.py     |   0
 {dpctl_ext => dpnp}/tensor/_device.py         |   0
 .../tensor/_dldevice_conversions.py           |   0
 {dpctl_ext => dpnp}/tensor/_dlpack.pxd        |   0
 {dpctl_ext => dpnp}/tensor/_dlpack.pyx        |   6 +-
 .../tensor/_elementwise_common.py             |   6 +-
 .../tensor/_elementwise_funcs.py              |   4 +-
 {dpctl_ext => dpnp}/tensor/_flags.pyx         |   0
 .../tensor/_indexing_functions.py             |   6 +-
 .../tensor/_linear_algebra_functions.py       |  10 +-
 .../tensor/_manipulation_functions.py         |   6 +-
 {dpctl_ext => dpnp}/tensor/_numpy_helper.py   |   0
 {dpctl_ext => dpnp}/tensor/_print.py          |   6 +-
 {dpctl_ext => dpnp}/tensor/_reduction.py      |   8 +-
 {dpctl_ext => dpnp}/tensor/_reshape.py        |   4 +-
 {dpctl_ext => dpnp}/tensor/_scalar_utils.py   |   4 +-
 .../tensor/_search_functions.py               |   6 +-
 {dpctl_ext => dpnp}/tensor/_searchsorted.py   |   3 -
 {dpctl_ext => dpnp}/tensor/_set_functions.py  |   4 +-
 {dpctl_ext => dpnp}/tensor/_slicing.pxi       |   0
 {dpctl_ext => dpnp}/tensor/_sorting.py        |   6 +-
 .../tensor/_statistical_functions.py          |  10 +-
 {dpctl_ext => dpnp}/tensor/_stride_utils.pxi  |   0
 {dpctl_ext => dpnp}/tensor/_testing.py        |   4 +-
 {dpctl_ext => dpnp}/tensor/_type_utils.py     |   6 +-
 {dpctl_ext => dpnp}/tensor/_types.pxi         |   0
 {dpctl_ext => dpnp}/tensor/_usmarray.pxd      |   0
 {dpctl_ext => dpnp}/tensor/_usmarray.pyx      | 127 +++++------
 .../tensor/_utility_functions.py              |   8 +-
 .../tensor/include/dlpack/LICENSE.third-party |   0
 .../tensor/include/dlpack/README.md           |   0
 .../tensor/include/dlpack/dlpack.h            |   0
 .../include/kernels/accumulators.hpp          |   0
 .../libtensor/include/kernels/alignment.hpp   |   0
 .../kernels/boolean_advanced_indexing.hpp     |   0
 .../tensor/libtensor/include/kernels/clip.hpp |   0
 .../include/kernels/constructors.hpp          |   0
 .../include/kernels/copy_and_cast.hpp         |   0
 .../include/kernels/copy_as_contiguous.hpp    |   0
 .../include/kernels/dpctl_tensor_types.hpp    |   0
 .../kernels/elementwise_functions/abs.hpp     |   0
 .../kernels/elementwise_functions/acos.hpp    |   0
 .../kernels/elementwise_functions/acosh.hpp   |   0
 .../kernels/elementwise_functions/add.hpp     |   0
 .../kernels/elementwise_functions/angle.hpp   |   0
 .../kernels/elementwise_functions/asin.hpp    |   0
 .../kernels/elementwise_functions/asinh.hpp   |   0
 .../kernels/elementwise_functions/atan.hpp    |   0
 .../kernels/elementwise_functions/atan2.hpp   |   0
 .../kernels/elementwise_functions/atanh.hpp   |   0
 .../elementwise_functions/bitwise_and.hpp     |   0
 .../elementwise_functions/bitwise_invert.hpp  |   0
 .../bitwise_left_shift.hpp                    |   0
 .../elementwise_functions/bitwise_or.hpp      |   0
 .../bitwise_right_shift.hpp                   |   0
 .../elementwise_functions/bitwise_xor.hpp     |   0
 .../elementwise_functions/cabs_impl.hpp       |   0
 .../kernels/elementwise_functions/cbrt.hpp    |   0
 .../kernels/elementwise_functions/ceil.hpp    |   0
 .../kernels/elementwise_functions/common.hpp  |   0
 .../elementwise_functions/common_detail.hpp   |   0
 .../elementwise_functions/common_inplace.hpp  |   0
 .../kernels/elementwise_functions/conj.hpp    |   0
 .../elementwise_functions/copysign.hpp        |   0
 .../kernels/elementwise_functions/cos.hpp     |   0
 .../kernels/elementwise_functions/cosh.hpp    |   0
 .../kernels/elementwise_functions/equal.hpp   |   0
 .../kernels/elementwise_functions/exp.hpp     |   0
 .../kernels/elementwise_functions/exp2.hpp    |   0
 .../kernels/elementwise_functions/expm1.hpp   |   0
 .../kernels/elementwise_functions/floor.hpp   |   0
 .../elementwise_functions/floor_divide.hpp    |   0
 .../kernels/elementwise_functions/greater.hpp |   0
 .../elementwise_functions/greater_equal.hpp   |   0
 .../kernels/elementwise_functions/hypot.hpp   |   0
 .../kernels/elementwise_functions/imag.hpp    |   0
 .../elementwise_functions/isfinite.hpp        |   0
 .../kernels/elementwise_functions/isinf.hpp   |   0
 .../kernels/elementwise_functions/isnan.hpp   |   0
 .../kernels/elementwise_functions/less.hpp    |   0
 .../elementwise_functions/less_equal.hpp      |   0
 .../kernels/elementwise_functions/log.hpp     |   0
 .../kernels/elementwise_functions/log10.hpp   |   0
 .../kernels/elementwise_functions/log1p.hpp   |   0
 .../kernels/elementwise_functions/log2.hpp    |   0
 .../elementwise_functions/logaddexp.hpp       |   0
 .../elementwise_functions/logical_and.hpp     |   0
 .../elementwise_functions/logical_not.hpp     |   0
 .../elementwise_functions/logical_or.hpp      |   0
 .../elementwise_functions/logical_xor.hpp     |   0
 .../kernels/elementwise_functions/maximum.hpp |   0
 .../kernels/elementwise_functions/minimum.hpp |   0
 .../elementwise_functions/multiply.hpp        |   0
 .../elementwise_functions/negative.hpp        |   0
 .../elementwise_functions/nextafter.hpp       |   0
 .../elementwise_functions/not_equal.hpp       |   0
 .../elementwise_functions/positive.hpp        |   0
 .../kernels/elementwise_functions/pow.hpp     |   0
 .../kernels/elementwise_functions/proj.hpp    |   0
 .../kernels/elementwise_functions/real.hpp    |   0
 .../elementwise_functions/reciprocal.hpp      |   0
 .../elementwise_functions/remainder.hpp       |   0
 .../kernels/elementwise_functions/round.hpp   |   0
 .../kernels/elementwise_functions/rsqrt.hpp   |   0
 .../kernels/elementwise_functions/sign.hpp    |   0
 .../kernels/elementwise_functions/signbit.hpp |   0
 .../kernels/elementwise_functions/sin.hpp     |   0
 .../kernels/elementwise_functions/sinh.hpp    |   0
 .../kernels/elementwise_functions/sqrt.hpp    |   0
 .../kernels/elementwise_functions/square.hpp  |   0
 .../elementwise_functions/subtract.hpp        |   0
 .../elementwise_functions/sycl_complex.hpp    |   0
 .../kernels/elementwise_functions/tan.hpp     |   0
 .../kernels/elementwise_functions/tanh.hpp    |   0
 .../elementwise_functions/true_divide.hpp     |   0
 .../kernels/elementwise_functions/trunc.hpp   |   0
 .../elementwise_functions/vec_size_util.hpp   |   0
 .../kernels/integer_advanced_indexing.hpp     |   0
 .../kernels/linalg_functions/dot_product.hpp  |   0
 .../include/kernels/linalg_functions/gemm.hpp |   0
 .../libtensor/include/kernels/reductions.hpp  |   0
 .../libtensor/include/kernels/repeat.hpp      |   0
 .../include/kernels/sorting/isin.hpp          |   0
 .../include/kernels/sorting/merge_sort.hpp    |   0
 .../include/kernels/sorting/radix_sort.hpp    |   0
 .../kernels/sorting/search_sorted_detail.hpp  |   0
 .../include/kernels/sorting/searchsorted.hpp  |   0
 .../kernels/sorting/sort_impl_fn_ptr_t.hpp    |   0
 .../include/kernels/sorting/sort_utils.hpp    |   0
 .../include/kernels/sorting/topk.hpp          |   0
 .../libtensor/include/kernels/where.hpp       |   0
 .../include/utils/indexing_utils.hpp          |   0
 .../libtensor/include/utils/math_utils.hpp    |   0
 .../include/utils/memory_overlap.hpp          |   0
 .../libtensor/include/utils/offset_utils.hpp  |   0
 .../include/utils/output_validation.hpp       |   0
 .../include/utils/rich_comparisons.hpp        |   0
 .../libtensor/include/utils/strided_iters.hpp |   0
 .../include/utils/sycl_alloc_utils.hpp        |   0
 .../libtensor/include/utils/sycl_utils.hpp    |   0
 .../libtensor/include/utils/type_dispatch.hpp |   0
 .../include/utils/type_dispatch_building.hpp  |   0
 .../libtensor/include/utils/type_utils.hpp    |   0
 .../tensor/libtensor/source/accumulators.cpp  |   0
 .../tensor/libtensor/source/accumulators.hpp  |   0
 .../accumulators/accumulate_over_axis.hpp     |   0
 .../accumulators/accumulators_common.cpp      |   0
 .../accumulators/accumulators_common.hpp      |   0
 .../accumulators/cumulative_logsumexp.cpp     |   0
 .../accumulators/cumulative_logsumexp.hpp     |   0
 .../source/accumulators/cumulative_prod.cpp   |   0
 .../source/accumulators/cumulative_prod.hpp   |   0
 .../source/accumulators/cumulative_sum.cpp    |   0
 .../source/accumulators/cumulative_sum.hpp    |   0
 .../source/boolean_advanced_indexing.cpp      |   0
 .../source/boolean_advanced_indexing.hpp      |   0
 .../tensor/libtensor/source/clip.cpp          |   0
 .../tensor/libtensor/source/clip.hpp          |   0
 .../source/copy_and_cast_usm_to_usm.cpp       |   0
 .../source/copy_and_cast_usm_to_usm.hpp       |   0
 .../libtensor/source/copy_as_contig.cpp       |   0
 .../libtensor/source/copy_as_contig.hpp       |   0
 .../libtensor/source/copy_for_reshape.cpp     |   0
 .../libtensor/source/copy_for_reshape.hpp     |   0
 .../tensor/libtensor/source/copy_for_roll.cpp |   0
 .../tensor/libtensor/source/copy_for_roll.hpp |   0
 .../copy_numpy_ndarray_into_usm_ndarray.cpp   |   0
 .../copy_numpy_ndarray_into_usm_ndarray.hpp   |   0
 .../source/device_support_queries.cpp         |   0
 .../source/device_support_queries.hpp         |   0
 .../source/elementwise_functions/abs.cpp      |   0
 .../source/elementwise_functions/abs.hpp      |   0
 .../source/elementwise_functions/acos.cpp     |   0
 .../source/elementwise_functions/acos.hpp     |   0
 .../source/elementwise_functions/acosh.cpp    |   0
 .../source/elementwise_functions/acosh.hpp    |   0
 .../source/elementwise_functions/add.cpp      |   0
 .../source/elementwise_functions/add.hpp      |   0
 .../source/elementwise_functions/angle.cpp    |   0
 .../source/elementwise_functions/angle.hpp    |   0
 .../source/elementwise_functions/asin.cpp     |   0
 .../source/elementwise_functions/asin.hpp     |   0
 .../source/elementwise_functions/asinh.cpp    |   0
 .../source/elementwise_functions/asinh.hpp    |   0
 .../source/elementwise_functions/atan.cpp     |   0
 .../source/elementwise_functions/atan.hpp     |   0
 .../source/elementwise_functions/atan2.cpp    |   0
 .../source/elementwise_functions/atan2.hpp    |   0
 .../source/elementwise_functions/atanh.cpp    |   0
 .../source/elementwise_functions/atanh.hpp    |   0
 .../elementwise_functions/bitwise_and.cpp     |   0
 .../elementwise_functions/bitwise_and.hpp     |   0
 .../elementwise_functions/bitwise_invert.cpp  |   0
 .../elementwise_functions/bitwise_invert.hpp  |   0
 .../bitwise_left_shift.cpp                    |   0
 .../bitwise_left_shift.hpp                    |   0
 .../elementwise_functions/bitwise_or.cpp      |   0
 .../elementwise_functions/bitwise_or.hpp      |   0
 .../bitwise_right_shift.cpp                   |   0
 .../bitwise_right_shift.hpp                   |   0
 .../elementwise_functions/bitwise_xor.cpp     |   0
 .../elementwise_functions/bitwise_xor.hpp     |   0
 .../source/elementwise_functions/cbrt.cpp     |   0
 .../source/elementwise_functions/cbrt.hpp     |   0
 .../source/elementwise_functions/ceil.cpp     |   0
 .../source/elementwise_functions/ceil.hpp     |   0
 .../source/elementwise_functions/conj.cpp     |   0
 .../source/elementwise_functions/conj.hpp     |   0
 .../source/elementwise_functions/copysign.cpp |   0
 .../source/elementwise_functions/copysign.hpp |   0
 .../source/elementwise_functions/cos.cpp      |   0
 .../source/elementwise_functions/cos.hpp      |   0
 .../source/elementwise_functions/cosh.cpp     |   0
 .../source/elementwise_functions/cosh.hpp     |   0
 .../elementwise_common.cpp                    |   0
 .../elementwise_common.hpp                    |   0
 .../elementwise_functions.hpp                 |   0
 .../elementwise_functions_type_utils.cpp      |   0
 .../elementwise_functions_type_utils.hpp      |   0
 .../source/elementwise_functions/equal.cpp    |   0
 .../source/elementwise_functions/equal.hpp    |   0
 .../source/elementwise_functions/exp.cpp      |   0
 .../source/elementwise_functions/exp.hpp      |   0
 .../source/elementwise_functions/exp2.cpp     |   0
 .../source/elementwise_functions/exp2.hpp     |   0
 .../source/elementwise_functions/expm1.cpp    |   0
 .../source/elementwise_functions/expm1.hpp    |   0
 .../source/elementwise_functions/floor.cpp    |   0
 .../source/elementwise_functions/floor.hpp    |   0
 .../elementwise_functions/floor_divide.cpp    |   0
 .../elementwise_functions/floor_divide.hpp    |   0
 .../source/elementwise_functions/greater.cpp  |   0
 .../source/elementwise_functions/greater.hpp  |   0
 .../elementwise_functions/greater_equal.cpp   |   0
 .../elementwise_functions/greater_equal.hpp   |   0
 .../source/elementwise_functions/hypot.cpp    |   0
 .../source/elementwise_functions/hypot.hpp    |   0
 .../source/elementwise_functions/imag.cpp     |   0
 .../source/elementwise_functions/imag.hpp     |   0
 .../source/elementwise_functions/isfinite.cpp |   0
 .../source/elementwise_functions/isfinite.hpp |   0
 .../source/elementwise_functions/isinf.cpp    |   0
 .../source/elementwise_functions/isinf.hpp    |   0
 .../source/elementwise_functions/isnan.cpp    |   0
 .../source/elementwise_functions/isnan.hpp    |   0
 .../source/elementwise_functions/less.cpp     |   0
 .../source/elementwise_functions/less.hpp     |   0
 .../elementwise_functions/less_equal.cpp      |   0
 .../elementwise_functions/less_equal.hpp      |   0
 .../source/elementwise_functions/log.cpp      |   0
 .../source/elementwise_functions/log.hpp      |   0
 .../source/elementwise_functions/log10.cpp    |   0
 .../source/elementwise_functions/log10.hpp    |   0
 .../source/elementwise_functions/log1p.cpp    |   0
 .../source/elementwise_functions/log1p.hpp    |   0
 .../source/elementwise_functions/log2.cpp     |   0
 .../source/elementwise_functions/log2.hpp     |   0
 .../elementwise_functions/logaddexp.cpp       |   0
 .../elementwise_functions/logaddexp.hpp       |   0
 .../elementwise_functions/logical_and.cpp     |   0
 .../elementwise_functions/logical_and.hpp     |   0
 .../elementwise_functions/logical_not.cpp     |   0
 .../elementwise_functions/logical_not.hpp     |   0
 .../elementwise_functions/logical_or.cpp      |   0
 .../elementwise_functions/logical_or.hpp      |   0
 .../elementwise_functions/logical_xor.cpp     |   0
 .../elementwise_functions/logical_xor.hpp     |   0
 .../source/elementwise_functions/maximum.cpp  |   0
 .../source/elementwise_functions/maximum.hpp  |   0
 .../source/elementwise_functions/minimum.cpp  |   0
 .../source/elementwise_functions/minimum.hpp  |   0
 .../source/elementwise_functions/multiply.cpp |   0
 .../source/elementwise_functions/multiply.hpp |   0
 .../source/elementwise_functions/negative.cpp |   0
 .../source/elementwise_functions/negative.hpp |   0
 .../elementwise_functions/nextafter.cpp       |   0
 .../elementwise_functions/nextafter.hpp       |   0
 .../elementwise_functions/not_equal.cpp       |   0
 .../elementwise_functions/not_equal.hpp       |   0
 .../source/elementwise_functions/positive.cpp |   0
 .../source/elementwise_functions/positive.hpp |   0
 .../source/elementwise_functions/pow.cpp      |   0
 .../source/elementwise_functions/pow.hpp      |   0
 .../source/elementwise_functions/proj.cpp     |   0
 .../source/elementwise_functions/proj.hpp     |   0
 .../source/elementwise_functions/real.cpp     |   0
 .../source/elementwise_functions/real.hpp     |   0
 .../elementwise_functions/reciprocal.cpp      |   0
 .../elementwise_functions/reciprocal.hpp      |   0
 .../elementwise_functions/remainder.cpp       |   0
 .../elementwise_functions/remainder.hpp       |   0
 .../source/elementwise_functions/round.cpp    |   0
 .../source/elementwise_functions/round.hpp    |   0
 .../source/elementwise_functions/rsqrt.cpp    |   0
 .../source/elementwise_functions/rsqrt.hpp    |   0
 .../source/elementwise_functions/sign.cpp     |   0
 .../source/elementwise_functions/sign.hpp     |   0
 .../source/elementwise_functions/signbit.cpp  |   0
 .../source/elementwise_functions/signbit.hpp  |   0
 .../source/elementwise_functions/sin.cpp      |   0
 .../source/elementwise_functions/sin.hpp      |   0
 .../source/elementwise_functions/sinh.cpp     |   0
 .../source/elementwise_functions/sinh.hpp     |   0
 .../source/elementwise_functions/sqrt.cpp     |   0
 .../source/elementwise_functions/sqrt.hpp     |   0
 .../source/elementwise_functions/square.cpp   |   0
 .../source/elementwise_functions/square.hpp   |   0
 .../source/elementwise_functions/subtract.cpp |   0
 .../source/elementwise_functions/subtract.hpp |   0
 .../source/elementwise_functions/tan.cpp      |   0
 .../source/elementwise_functions/tan.hpp      |   0
 .../source/elementwise_functions/tanh.cpp     |   0
 .../source/elementwise_functions/tanh.hpp     |   0
 .../elementwise_functions/true_divide.cpp     |   0
 .../elementwise_functions/true_divide.hpp     |   0
 .../source/elementwise_functions/trunc.cpp    |   0
 .../source/elementwise_functions/trunc.hpp    |   0
 .../tensor/libtensor/source/eye_ctor.cpp      |   0
 .../tensor/libtensor/source/eye_ctor.hpp      |   0
 .../tensor/libtensor/source/full_ctor.cpp     |   0
 .../tensor/libtensor/source/full_ctor.hpp     |   0
 .../source/integer_advanced_indexing.cpp      |   0
 .../source/integer_advanced_indexing.hpp      |   0
 .../libtensor/source/linalg_functions/dot.cpp |   0
 .../libtensor/source/linalg_functions/dot.hpp |   0
 .../linalg_functions/dot_atomic_support.hpp   |   0
 .../source/linalg_functions/dot_dispatch.hpp  |   0
 .../libtensor/source/linear_sequences.cpp     |   0
 .../libtensor/source/linear_sequences.hpp     |   0
 .../libtensor/source/reductions/all.cpp       |   0
 .../libtensor/source/reductions/all.hpp       |   0
 .../libtensor/source/reductions/any.cpp       |   0
 .../libtensor/source/reductions/any.hpp       |   0
 .../libtensor/source/reductions/argmax.cpp    |   0
 .../libtensor/source/reductions/argmax.hpp    |   0
 .../libtensor/source/reductions/argmin.cpp    |   0
 .../libtensor/source/reductions/argmin.hpp    |   0
 .../libtensor/source/reductions/logsumexp.cpp |   0
 .../libtensor/source/reductions/logsumexp.hpp |   0
 .../libtensor/source/reductions/max.cpp       |   0
 .../libtensor/source/reductions/max.hpp       |   0
 .../libtensor/source/reductions/min.cpp       |   0
 .../libtensor/source/reductions/min.hpp       |   0
 .../libtensor/source/reductions/prod.cpp      |   0
 .../libtensor/source/reductions/prod.hpp      |   0
 .../source/reductions/reduce_hypot.cpp        |   0
 .../source/reductions/reduce_hypot.hpp        |   0
 .../reductions/reduction_atomic_support.hpp   |   0
 .../source/reductions/reduction_common.cpp    |   0
 .../source/reductions/reduction_common.hpp    |   0
 .../source/reductions/reduction_over_axis.hpp |   0
 .../libtensor/source/reductions/sum.cpp       |   0
 .../libtensor/source/reductions/sum.hpp       |   0
 .../tensor/libtensor/source/repeat.cpp        |   0
 .../tensor/libtensor/source/repeat.hpp        |   0
 .../source/simplify_iteration_space.cpp       |   0
 .../source/simplify_iteration_space.hpp       |   0
 .../tensor/libtensor/source/sorting/isin.cpp  |   0
 .../tensor/libtensor/source/sorting/isin.hpp  |   0
 .../source/sorting/merge_argsort.cpp          |   0
 .../source/sorting/merge_argsort.hpp          |   0
 .../libtensor/source/sorting/merge_sort.cpp   |   0
 .../libtensor/source/sorting/merge_sort.hpp   |   0
 .../source/sorting/py_argsort_common.hpp      |   0
 .../source/sorting/py_sort_common.hpp         |   0
 .../source/sorting/radix_argsort.cpp          |   0
 .../source/sorting/radix_argsort.hpp          |   0
 .../libtensor/source/sorting/radix_sort.cpp   |   0
 .../libtensor/source/sorting/radix_sort.hpp   |   0
 .../source/sorting/radix_sort_support.hpp     |   0
 .../libtensor/source/sorting/searchsorted.cpp |   0
 .../libtensor/source/sorting/searchsorted.hpp |   0
 .../tensor/libtensor/source/sorting/topk.cpp  |   0
 .../tensor/libtensor/source/sorting/topk.hpp  |   0
 .../libtensor/source/tensor_accumulation.cpp  |   0
 .../tensor/libtensor/source/tensor_ctors.cpp  |   0
 .../libtensor/source/tensor_elementwise.cpp   |   0
 .../tensor/libtensor/source/tensor_linalg.cpp |   0
 .../libtensor/source/tensor_reductions.cpp    |   0
 .../libtensor/source/tensor_sorting.cpp       |   0
 .../tensor/libtensor/source/triul_ctor.cpp    |   0
 .../tensor/libtensor/source/triul_ctor.hpp    |   0
 .../tensor/libtensor/source/where.cpp         |   0
 .../tensor/libtensor/source/where.hpp         |   0
 .../tensor/libtensor/source/zeros_ctor.cpp    |   0
 .../tensor/libtensor/source/zeros_ctor.hpp    |   0
 dpnp/tests/test_array_api_info.py             |   5 +-
 dpnp/tests/test_arraycreation.py              |   4 +-
 dpnp/tests/test_arraymanipulation.py          |   9 +-
 dpnp/tests/test_counting.py                   |   5 +-
 dpnp/tests/test_fft.py                        |   4 +-
 dpnp/tests/test_flipping.py                   |   5 +-
 dpnp/tests/test_indexing.py                   |   8 +-
 dpnp/tests/test_linalg.py                     |   9 +-
 dpnp/tests/test_manipulation.py               |   9 +-
 dpnp/tests/test_mathematical.py               |  13 +-
 dpnp/tests/test_memory.py                     |   4 +-
 dpnp/tests/test_nanfunctions.py               |   4 +-
 dpnp/tests/test_ndarray.py                    |   4 +-
 dpnp/tests/test_product.py                    |   5 +-
 dpnp/tests/test_search.py                     |   4 +-
 dpnp/tests/test_sort.py                       |   5 +-
 dpnp/tests/test_statistics.py                 |   4 +-
 dpnp/tests/test_sycl_queue.py                 |   4 +-
 dpnp/tests/test_usm_type.py                   |   4 +-
 dpnp/tests/test_utils.py                      |   4 +-
 .../cupy/core_tests/test_dlpack.py            |   4 +-
 .../cupy/core_tests/test_ndarray.py           |   5 +-
 .../cupy/lib_tests/test_shape_base.py         |   5 +-
 .../cupy/manipulation_tests/test_dims.py      |   5 +-
 .../cupy/manipulation_tests/test_transpose.py |   5 +-
 .../cupy/math_tests/test_sumprod.py           |   5 +-
 .../cupy/sorting_tests/test_sort.py           |   5 +-
 .../cupy/statistics_tests/test_meanvar.py     |   5 +-
 dpnp/tests/third_party/cupy/testing/_loops.py |   5 +-
 setup.py                                      |   4 +-
 470 files changed, 385 insertions(+), 691 deletions(-)
 delete mode 100644 dpctl_ext/CMakeLists.txt
 delete mode 100644 dpctl_ext/__init__.py
 rename {dpctl_ext => dpnp}/tensor/CMakeLists.txt (92%)
 rename {dpctl_ext => dpnp}/tensor/__init__.pxd (100%)
 rename {dpctl_ext => dpnp}/tensor/__init__.py (100%)
 rename {dpctl_ext => dpnp}/tensor/_accumulation.py (98%)
 rename {dpctl_ext => dpnp}/tensor/_array_api.py (98%)
 rename {dpctl_ext => dpnp}/tensor/_clip.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_constants.py (100%)
 rename {dpctl_ext => dpnp}/tensor/_copy_utils.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_ctors.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_data_types.py (100%)
 rename {dpctl_ext => dpnp}/tensor/_device.py (100%)
 rename {dpctl_ext => dpnp}/tensor/_dldevice_conversions.py (100%)
 rename {dpctl_ext => dpnp}/tensor/_dlpack.pxd (100%)
 rename {dpctl_ext => dpnp}/tensor/_dlpack.pyx (99%)
 rename {dpctl_ext => dpnp}/tensor/_elementwise_common.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_elementwise_funcs.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_flags.pyx (100%)
 rename {dpctl_ext => dpnp}/tensor/_indexing_functions.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_linear_algebra_functions.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_manipulation_functions.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_numpy_helper.py (100%)
 rename {dpctl_ext => dpnp}/tensor/_print.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_reduction.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_reshape.py (98%)
 rename {dpctl_ext => dpnp}/tensor/_scalar_utils.py (97%)
 rename {dpctl_ext => dpnp}/tensor/_search_functions.py (98%)
 rename {dpctl_ext => dpnp}/tensor/_searchsorted.py (98%)
 rename {dpctl_ext => dpnp}/tensor/_set_functions.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_slicing.pxi (100%)
 rename {dpctl_ext => dpnp}/tensor/_sorting.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_statistical_functions.py (98%)
 rename {dpctl_ext => dpnp}/tensor/_stride_utils.pxi (100%)
 rename {dpctl_ext => dpnp}/tensor/_testing.py (98%)
 rename {dpctl_ext => dpnp}/tensor/_type_utils.py (99%)
 rename {dpctl_ext => dpnp}/tensor/_types.pxi (100%)
 rename {dpctl_ext => dpnp}/tensor/_usmarray.pxd (100%)
 rename {dpctl_ext => dpnp}/tensor/_usmarray.pyx (93%)
 rename {dpctl_ext => dpnp}/tensor/_utility_functions.py (98%)
 rename {dpctl_ext => dpnp}/tensor/include/dlpack/LICENSE.third-party (100%)
 rename {dpctl_ext => dpnp}/tensor/include/dlpack/README.md (100%)
 rename {dpctl_ext => dpnp}/tensor/include/dlpack/dlpack.h (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/accumulators.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/alignment.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/clip.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/constructors.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/copy_and_cast.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/copy_as_contiguous.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/add.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/common.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/less.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/log.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/real.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/round.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/square.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/reductions.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/repeat.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/sorting/isin.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/sorting/merge_sort.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/sorting/radix_sort.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/sorting/searchsorted.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/sorting/sort_utils.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/sorting/topk.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/kernels/where.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/utils/indexing_utils.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/utils/math_utils.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/utils/memory_overlap.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/utils/offset_utils.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/utils/output_validation.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/utils/rich_comparisons.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/utils/strided_iters.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/utils/sycl_alloc_utils.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/utils/sycl_utils.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/utils/type_dispatch.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/utils/type_dispatch_building.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/include/utils/type_utils.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/accumulators.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/accumulators.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/accumulators/accumulators_common.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/accumulators/accumulators_common.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/accumulators/cumulative_prod.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/accumulators/cumulative_prod.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/accumulators/cumulative_sum.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/accumulators/cumulative_sum.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/boolean_advanced_indexing.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/boolean_advanced_indexing.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/clip.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/clip.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/copy_as_contig.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/copy_as_contig.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/copy_for_reshape.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/copy_for_reshape.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/copy_for_roll.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/copy_for_roll.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/device_support_queries.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/device_support_queries.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/abs.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/abs.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/acos.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/acos.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/acosh.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/acosh.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/add.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/add.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/angle.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/angle.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/asin.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/asin.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/asinh.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/asinh.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/atan.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/atan.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/atan2.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/atan2.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/atanh.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/atanh.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/cbrt.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/cbrt.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/ceil.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/ceil.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/conj.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/conj.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/copysign.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/copysign.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/cos.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/cos.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/cosh.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/cosh.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/equal.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/equal.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/exp.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/exp.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/exp2.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/exp2.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/expm1.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/expm1.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/floor.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/floor.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/floor_divide.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/floor_divide.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/greater.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/greater.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/greater_equal.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/greater_equal.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/hypot.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/hypot.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/imag.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/imag.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/isfinite.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/isfinite.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/isinf.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/isinf.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/isnan.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/isnan.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/less.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/less.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/less_equal.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/less_equal.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/log.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/log.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/log10.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/log10.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/log1p.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/log1p.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/log2.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/log2.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/logaddexp.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/logaddexp.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/logical_and.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/logical_and.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/logical_not.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/logical_not.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/logical_or.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/logical_or.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/logical_xor.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/logical_xor.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/maximum.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/maximum.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/minimum.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/minimum.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/multiply.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/multiply.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/negative.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/negative.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/nextafter.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/nextafter.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/not_equal.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/not_equal.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/positive.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/positive.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/pow.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/pow.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/proj.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/proj.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/real.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/real.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/reciprocal.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/reciprocal.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/remainder.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/remainder.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/round.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/round.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/rsqrt.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/rsqrt.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/sign.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/sign.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/signbit.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/signbit.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/sin.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/sin.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/sinh.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/sinh.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/sqrt.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/sqrt.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/square.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/square.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/subtract.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/subtract.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/tan.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/tan.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/tanh.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/tanh.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/true_divide.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/true_divide.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/trunc.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/elementwise_functions/trunc.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/eye_ctor.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/eye_ctor.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/full_ctor.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/full_ctor.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/integer_advanced_indexing.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/integer_advanced_indexing.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/linalg_functions/dot.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/linalg_functions/dot.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/linear_sequences.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/linear_sequences.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/all.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/all.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/any.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/any.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/argmax.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/argmax.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/argmin.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/argmin.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/logsumexp.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/logsumexp.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/max.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/max.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/min.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/min.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/prod.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/prod.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/reduce_hypot.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/reduce_hypot.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/reduction_atomic_support.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/reduction_common.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/reduction_common.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/reduction_over_axis.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/sum.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/reductions/sum.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/repeat.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/repeat.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/simplify_iteration_space.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/simplify_iteration_space.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/isin.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/isin.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/merge_argsort.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/merge_argsort.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/merge_sort.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/merge_sort.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/py_argsort_common.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/py_sort_common.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/radix_argsort.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/radix_argsort.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/radix_sort.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/radix_sort.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/radix_sort_support.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/searchsorted.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/searchsorted.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/topk.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/sorting/topk.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/tensor_accumulation.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/tensor_ctors.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/tensor_elementwise.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/tensor_linalg.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/tensor_reductions.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/tensor_sorting.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/triul_ctor.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/triul_ctor.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/where.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/where.hpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/zeros_ctor.cpp (100%)
 rename {dpctl_ext => dpnp}/tensor/libtensor/source/zeros_ctor.hpp (100%)

diff --git a/.gitignore b/.gitignore
index f8ed987fa0d9..f66bfbb3fdd8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,12 +28,8 @@ dpnp_pytest.*
 example3
 
 *dpnp_backend*
+dpnp/include/dpnp/tensor/*.h
 dpnp/**/*.cpython*.so
 dpnp/**/*.pyd
 *~
 core
-
-# TODO: revert to `dpctl/`
-# when dpnp fully migrates dpctl/tensor
-dpctl_ext/**/*.cpython*.so
-dpctl_ext/include/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6dedacc3bc43..9cc81c8005f7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,12 +37,23 @@ project(
 )
 
 option(DPNP_GENERATE_COVERAGE "Enable build DPNP with coverage instrumentation" OFF)
+option(
+    DPNP_TENSOR_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS
+    "Build dpnp tensor pybind11 offloading extensions with coverage instrumentation"
+    OFF
+)
 option(DPNP_BACKEND_TESTS "Enable building of DPNP backend test suite" OFF)
 option(
     DPNP_WITH_REDIST
     "Build DPNP assuming DPC++ redistributable is installed into Python prefix"
     OFF
 )
+option(
+    DPNP_TENSOR_OFFLOAD_COMPRESS
+    "Build dpnp tensor using offload section compression feature of DPC++ to reduce \
+size of shared object with offloading sections"
+    OFF
+)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
@@ -344,5 +355,4 @@ if(DEFINED SKBUILD)
     set(_ignore_me ${SKBUILD})
 endif()
 
-add_subdirectory(dpctl_ext)
 add_subdirectory(dpnp)
diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt
deleted file mode 100644
index fa187463414d..000000000000
--- a/dpctl_ext/CMakeLists.txt
+++ /dev/null
@@ -1,202 +0,0 @@
-# -*- coding: utf-8 -*-
-# *****************************************************************************
-# Copyright (c) 2026, Intel Corporation
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# - Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the following disclaimer.
-# - Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-# - Neither the name of the copyright holder nor the names of its contributors
-#   may be used to endorse or promote products derived from this software
-#   without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-# THE POSSIBILITY OF SUCH DAMAGE.
-# *****************************************************************************
-
-# TODO: rework this logic to remove current duplication
-if(WIN32)
-    string(
-        CONCAT WARNING_FLAGS
-        "-Wall "
-        "-Wextra "
-        "-Winit-self "
-        "-Wunused-function "
-        "-Wuninitialized "
-        "-Wmissing-declarations "
-        "-Wstrict-prototypes "
-        "-Wno-unused-parameter "
-    )
-    string(CONCAT SDL_FLAGS "/GS " "/DynamicBase ")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}")
-    set(CMAKE_C_FLAGS_DEBUG
-        "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\""
-    )
-    set(CMAKE_CXX_FLAGS_DEBUG
-        "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\""
-    )
-    set(CMAKE_C_FLAGS_COVERAGE
-        "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG"
-    )
-    set(CMAKE_CXX_FLAGS_COVERAGE
-        "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG"
-    )
-    set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}")
-    set(DPCTL_LDFLAGS "/NXCompat;/DynamicBase")
-    mark_as_advanced(
-        CMAKE_CXX_FLAGS_COVERAGE
-        CMAKE_C_FLAGS_COVERAGE
-        CMAKE_MODULE_LINKER_FLAGS_COVERAGE
-    )
-elseif(UNIX)
-    string(
-        CONCAT WARNING_FLAGS
-        "-Wall "
-        "-Wextra "
-        "-Winit-self "
-        "-Wunused-function "
-        "-Wuninitialized "
-        "-Wmissing-declarations "
-        "-Wstrict-prototypes "
-        "-Wno-unused-parameter "
-        "-fdiagnostics-color=auto "
-    )
-    string(
-        CONCAT SDL_FLAGS
-        "-fstack-protector "
-        "-fstack-protector-all "
-        "-fpic "
-        "-fPIC "
-        "-D_FORTIFY_SOURCE=2 "
-        "-Wformat "
-        "-Wformat-security "
-        #       "-fno-strict-overflow "    # no-strict-overflow is implied by -fwrapv
-        "-fno-delete-null-pointer-checks "
-        "-fwrapv "
-    )
-    string(CONCAT CFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}")
-    string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 ${CXXFLAGS}")
-    set(CMAKE_C_FLAGS_DEBUG
-        "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\""
-    )
-    set(CMAKE_CXX_FLAGS_DEBUG
-        "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\""
-    )
-    set(CMAKE_C_FLAGS_COVERAGE "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O1 -g1 -DDEBUG")
-    set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O1 -g1 -DDEBUG")
-    set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}")
-    set(DPCTL_LDFLAGS "-z,noexecstack,-z,relro,-z,now")
-    mark_as_advanced(
-        CMAKE_CXX_FLAGS_COVERAGE
-        CMAKE_C_FLAGS_COVERAGE
-        CMAKE_MODULE_LINKER_FLAGS_COVERAGE
-    )
-else()
-    message(FATAL_ERROR "Unsupported system.")
-endif()
-
-# at build time create include/ directory and copy header files over
-set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
-
-set(CMAKE_INSTALL_RPATH "$ORIGIN")
-
-function(build_dpctl_ext _trgt _src _dest)
-    set(options SYCL)
-    cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN})
-    add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src)
-    set(_cythonize_trgt "${_trgt}_cythonize_pyx")
-    python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src})
-    if(BUILD_DPCTL_EXT_SYCL)
-        add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src})
-        target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int)
-        target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel)
-        if(DPCTL_OFFLOAD_COMPRESS)
-            target_link_options(${_trgt} PRIVATE --offload-compress)
-        endif()
-        if(_dpctl_sycl_targets)
-            # make fat binary
-            target_compile_options(
-                ${_trgt}
-                PRIVATE ${_dpctl_sycl_target_compile_options}
-            )
-            target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options})
-        endif()
-    endif()
-    target_link_libraries(${_trgt} PRIVATE Python::NumPy)
-    if(DPCTL_GENERATE_COVERAGE)
-        target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1)
-        if(BUILD_DPCTL_EXT_SYCL)
-            target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer)
-        endif()
-    endif()
-    # Dpctl
-    target_include_directories(${_trgt} PRIVATE ${Dpctl_INCLUDE_DIR})
-    target_link_directories(${_trgt} PRIVATE ${Dpctl_INCLUDE_DIR}/..)
-    target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface)
-    set(_linker_options "LINKER:${DPCTL_LDFLAGS}")
-    target_link_options(${_trgt} PRIVATE ${_linker_options})
-    get_filename_component(_name_wle ${_generated_src} NAME_WLE)
-    get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY)
-    set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h")
-    set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h")
-
-    # TODO: create separate folder inside build folder that contains only
-    #   headers related to this target and appropriate folder structure to
-    #   eliminate shadow dependencies
-    # Go up two levels to build root for "dpctl_ext/tensor/_usmarray.h" resolution
-    get_filename_component(_parent_dir ${_generated_src_dir} DIRECTORY)
-    get_filename_component(_build_root ${_parent_dir} DIRECTORY)
-    # TODO: do not set directory if we did not generate header
-    target_include_directories(${_trgt} INTERFACE ${_build_root})
-    set(_rpath_value "$ORIGIN")
-    if(BUILD_DPCTL_EXT_RELATIVE_PATH)
-        set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}")
-    endif()
-    if(DPCTL_WITH_REDIST)
-        set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..")
-    endif()
-    set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value})
-
-    install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest})
-    install(
-        FILES ${_generated_api_h}
-        DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest}
-        OPTIONAL
-    )
-    install(
-        FILES ${_generated_public_h}
-        DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest}
-        OPTIONAL
-    )
-    if(DPCTL_GENERATE_COVERAGE)
-        get_filename_component(_original_src_dir ${_src} DIRECTORY)
-        file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir})
-        install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir})
-    endif()
-
-    # Create target with headers only, because python is managing all the
-    # library imports at runtime
-    set(_trgt_headers ${_trgt}_headers)
-    add_library(${_trgt_headers} INTERFACE)
-    add_dependencies(${_trgt_headers} ${_trgt})
-    get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES)
-    target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir})
-endfunction()
-
-add_subdirectory(tensor)
diff --git a/dpctl_ext/__init__.py b/dpctl_ext/__init__.py
deleted file mode 100644
index a71324cb88d8..000000000000
--- a/dpctl_ext/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# *****************************************************************************
-# Copyright (c) 2026, Intel Corporation
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# - Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the following disclaimer.
-# - Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-# - Neither the name of the copyright holder nor the names of its contributors
-#   may be used to endorse or promote products derived from this software
-#   without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-# THE POSSIBILITY OF SUCH DAMAGE.
-# *****************************************************************************
diff --git a/dpnp/CMakeLists.txt b/dpnp/CMakeLists.txt
index 6850b799735c..d7acf368bcd0 100644
--- a/dpnp/CMakeLists.txt
+++ b/dpnp/CMakeLists.txt
@@ -86,11 +86,96 @@ function(build_dpnp_cython_ext _trgt _src _dest)
     install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest})
 endfunction()
 
+function(build_dpnp_tensor_ext _trgt _src _dest)
+    set(options SYCL)
+    cmake_parse_arguments(BUILD_DPNP_TENSOR "${options}" "RELATIVE_PATH" "" ${ARGN})
+    add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src)
+    set(_cythonize_trgt "${_trgt}_cythonize_pyx")
+    python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src})
+    if(BUILD_DPNP_TENSOR_SYCL)
+        add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src})
+        target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int)
+        target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel)
+        if(DPNP_TENSOR_OFFLOAD_COMPRESS)
+            target_link_options(${_trgt} PRIVATE --offload-compress)
+        endif()
+        if(_dpnp_sycl_targets)
+            # make fat binary
+            target_compile_options(
+                ${_trgt}
+                PRIVATE ${_dpnp_sycl_target_compile_options}
+            )
+            target_link_options(${_trgt} PRIVATE ${_dpnp_sycl_target_link_options})
+        endif()
+    endif()
+    target_link_libraries(${_trgt} PRIVATE Python::NumPy)
+    if(DPNP_GENERATE_COVERAGE)
+        target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1)
+        if(BUILD_DPNP_TENSOR_SYCL)
+            target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer)
+        endif()
+    endif()
+    # Dpctl
+    target_include_directories(${_trgt} PRIVATE ${Dpctl_INCLUDE_DIR})
+    target_link_directories(${_trgt} PRIVATE ${Dpctl_INCLUDE_DIR}/..)
+    target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface)
+    set(_linker_options "LINKER:${DPNP_LDFLAGS}")
+    target_link_options(${_trgt} PRIVATE ${_linker_options})
+    get_filename_component(_name_wle ${_generated_src} NAME_WLE)
+    get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY)
+    set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h")
+    set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h")
+
+    # TODO: create separate folder inside build folder that contains only
+    #   headers related to this target and appropriate folder structure to
+    #   eliminate shadow dependencies
+    # Go up two levels to build root for "dpnp/tensor/_usmarray.h" resolution
+    get_filename_component(_parent_dir ${_generated_src_dir} DIRECTORY)
+    get_filename_component(_build_root ${_parent_dir} DIRECTORY)
+    # TODO: do not set directory if we did not generate header
+    target_include_directories(${_trgt} INTERFACE ${_build_root})
+    set(_rpath_value "$ORIGIN")
+    if(BUILD_DPNP_TENSOR_RELATIVE_PATH)
+        set(_rpath_value "${_rpath_value}/${BUILD_DPNP_TENSOR_RELATIVE_PATH}")
+    endif()
+    if(DPNP_WITH_REDIST)
+        set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..")
+    endif()
+    set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value})
+
+    install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest})
+    install(
+        FILES ${_generated_api_h}
+        DESTINATION ${CMAKE_INSTALL_PREFIX}/dpnp/include/${_dest}
+        OPTIONAL
+    )
+    install(
+        FILES ${_generated_public_h}
+        DESTINATION ${CMAKE_INSTALL_PREFIX}/dpnp/include/${_dest}
+        OPTIONAL
+    )
+    if(DPNP_GENERATE_COVERAGE)
+        get_filename_component(_original_src_dir ${_src} DIRECTORY)
+        file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir})
+        install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir})
+    endif()
+
+    # Create target with headers only, because python is managing all the
+    # library imports at runtime
+    set(_trgt_headers ${_trgt}_headers)
+    add_library(${_trgt_headers} INTERFACE)
+    add_dependencies(${_trgt_headers} ${_trgt})
+    get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES)
+    target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir})
+endfunction()
+
 function(build_dpnp_cython_ext_with_backend _trgt _src _dest)
     build_dpnp_cython_ext(${_trgt} ${_src} ${_dest})
     target_link_libraries(${_trgt} PRIVATE dpnp_backend_library)
 endfunction()
 
+add_subdirectory(tensor)
+
 add_subdirectory(backend)
 add_subdirectory(backend/extensions/blas)
 add_subdirectory(backend/extensions/fft)
diff --git a/dpnp/__init__.py b/dpnp/__init__.py
index dd413d02f2bb..d2ea158d4d44 100644
--- a/dpnp/__init__.py
+++ b/dpnp/__init__.py
@@ -60,9 +60,7 @@
                 [os.getenv("PATH", ""), dll_path]
             )
 
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor import __array_api_version__, DLDeviceType
+from .tensor import __array_api_version__, DLDeviceType
 
 from .dpnp_array import dpnp_array as ndarray
 from .dpnp_array_api_info import __array_namespace_info__
diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt
index 1bf6055d080b..67e0d4cf02e1 100644
--- a/dpnp/backend/extensions/blas/CMakeLists.txt
+++ b/dpnp/backend/extensions/blas/CMakeLists.txt
@@ -71,7 +71,7 @@ target_include_directories(
     PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt
index 28433ab5d98f..8f5179bbbd76 100644
--- a/dpnp/backend/extensions/fft/CMakeLists.txt
+++ b/dpnp/backend/extensions/fft/CMakeLists.txt
@@ -64,7 +64,7 @@ target_include_directories(
     ${python_module_name}
     PRIVATE
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt
index e00cee6a29a5..0ca611bfdc9f 100644
--- a/dpnp/backend/extensions/indexing/CMakeLists.txt
+++ b/dpnp/backend/extensions/indexing/CMakeLists.txt
@@ -68,7 +68,7 @@ target_include_directories(
     PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt
index 3105771d9722..6bf25ee651d2 100644
--- a/dpnp/backend/extensions/lapack/CMakeLists.txt
+++ b/dpnp/backend/extensions/lapack/CMakeLists.txt
@@ -89,7 +89,7 @@ target_include_directories(
         ${CMAKE_CURRENT_SOURCE_DIR}/../../
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index 0e96d7ead6c1..701a852c5903 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -73,7 +73,7 @@ target_include_directories(
     PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt
index 53d3a64122b3..68e6bf29135d 100644
--- a/dpnp/backend/extensions/ufunc/CMakeLists.txt
+++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt
@@ -91,7 +91,7 @@ target_include_directories(
         ${CMAKE_CURRENT_SOURCE_DIR}/../../
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt
index 5e0409f77671..a739838c8dcd 100644
--- a/dpnp/backend/extensions/vm/CMakeLists.txt
+++ b/dpnp/backend/extensions/vm/CMakeLists.txt
@@ -113,7 +113,7 @@ target_include_directories(
     PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt
index 6898cdb332e0..085cd47e7891 100644
--- a/dpnp/backend/extensions/window/CMakeLists.txt
+++ b/dpnp/backend/extensions/window/CMakeLists.txt
@@ -69,7 +69,7 @@ target_include_directories(
         ${CMAKE_CURRENT_SOURCE_DIR}/../../
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
-        ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include
+        ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp
index d8e4c81b20a5..ada7b7e380fb 100644
--- a/dpnp/backend/include/dpnp4pybind11.hpp
+++ b/dpnp/backend/include/dpnp4pybind11.hpp
@@ -52,9 +52,10 @@
 #include "dpctl/program/_program.h"
 #include "dpctl/program/_program_api.h"
 
-// Include generated Cython headers for usm_ndarray struct definition and C-API
-#include "dpctl_ext/tensor/_usmarray.h"
-#include "dpctl_ext/tensor/_usmarray_api.h"
+// Include generated Cython headers for usm_ndarray
+// (struct definition and constants only)
+#include "dpnp/tensor/_usmarray.h"
+#include "dpnp/tensor/_usmarray_api.h"
 
 #include <array>
 #include <cassert>
@@ -272,8 +273,8 @@ class dpctl_capi
         import_dpctl___sycl_queue();
         import_dpctl__memory___memory();
         import_dpctl__program___program();
-        // Import dpctl_ext tensor module for PyUSMArrayType
-        import_dpctl_ext__tensor___usmarray();
+        // Import dpnp tensor module for PyUSMArrayType
+        import_dpnp__tensor___usmarray();
 
         // Python type objects for classes implemented by dpctl
         this->Py_SyclDeviceType_ = &Py_SyclDeviceType;
@@ -386,10 +387,7 @@ class dpctl_capi
         default_usm_memory_ = std::shared_ptr<py::object>(
             new py::object{py_default_usm_memory}, Deleter{});
 
-        // TODO: revert to `py::module_::import("dpctl.tensor._usmarray");`
-        // when dpnp fully migrates dpctl/tensor
-        py::module_ mod_usmarray =
-            py::module_::import("dpctl_ext.tensor._usmarray");
+        py::module_ mod_usmarray = py::module_::import("dpnp.tensor._usmarray");
         auto tensor_kl = mod_usmarray.attr("usm_ndarray");
 
         const py::object &py_default_usm_ndarray =
diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py
index fb277dd4d310..66d8b9d9fbc8 100644
--- a/dpnp/dpnp_algo/dpnp_arraycreation.py
+++ b/dpnp/dpnp_algo/dpnp_arraycreation.py
@@ -32,10 +32,8 @@
 import dpctl.utils as dpu
 import numpy
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device
 
diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
index 271013b58090..5902d389391f 100644
--- a/dpnp/dpnp_algo/dpnp_elementwise_common.py
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -32,30 +32,29 @@
 import dpctl.utils as dpu
 import numpy
 
-# pylint: disable=no-name-in-module
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._copy_utils as dtc
-import dpctl_ext.tensor._tensor_impl as dti
-import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.vm._vm_impl as vmi
-from dpctl_ext.tensor._elementwise_common import (
+
+# pylint: disable=no-name-in-module
+import dpnp.tensor as dpt
+import dpnp.tensor._copy_utils as dtc
+import dpnp.tensor._tensor_impl as dti
+import dpnp.tensor._type_utils as dtu
+from dpnp.dpnp_array import dpnp_array
+from dpnp.dpnp_utils import get_usm_allocations
+from dpnp.dpnp_utils.dpnp_utils_common import (
+    find_buf_dtype_3out,
+    find_buf_dtype_4out,
+)
+from dpnp.tensor._elementwise_common import (
     BinaryElementwiseFunc,
     UnaryElementwiseFunc,
 )
-from dpctl_ext.tensor._scalar_utils import (
+from dpnp.tensor._scalar_utils import (
     _get_dtype,
     _get_shape,
     _validate_dtype,
 )
-from dpnp.dpnp_array import dpnp_array
-from dpnp.dpnp_utils import get_usm_allocations
-from dpnp.dpnp_utils.dpnp_utils_common import (
-    find_buf_dtype_3out,
-    find_buf_dtype_4out,
-)
 
 __all__ = [
     "DPNPI0",
diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py
index c9ae58a114a9..7afda62bb07f 100644
--- a/dpnp/dpnp_algo/dpnp_fill.py
+++ b/dpnp/dpnp_algo/dpnp_fill.py
@@ -30,12 +30,10 @@
 
 import dpctl.utils as dpu
 
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
-from dpctl_ext.tensor._ctors import _cast_fill_val
-from dpctl_ext.tensor._tensor_impl import (
+import dpnp.tensor as dpt
+from dpnp.tensor._ctors import _cast_fill_val
+from dpnp.tensor._tensor_impl import (
     _copy_usm_ndarray_into_usm_ndarray,
     _full_usm_ndarray,
     _zeros_usm_ndarray,
diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py
index cbb5835bbfc4..9cbfb8e27063 100644
--- a/dpnp/dpnp_array.py
+++ b/dpnp/dpnp_array.py
@@ -32,19 +32,18 @@
 
 """
 
+# pylint: disable=duplicate-code
 # pylint: disable=invalid-name
 # pylint: disable=protected-access
 
 import warnings
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._type_utils as dtu
 import dpnp
-from dpctl_ext.tensor._numpy_helper import AxisError
+import dpnp.tensor as dpt
+import dpnp.tensor._type_utils as dtu
 
 from . import memory as dpm
+from .tensor._numpy_helper import AxisError
 
 
 def _get_unwrapped_index_key(key):
diff --git a/dpnp/dpnp_array_api_info.py b/dpnp/dpnp_array_api_info.py
index f792600cbb66..ef3f1e4c2b60 100644
--- a/dpnp/dpnp_array_api_info.py
+++ b/dpnp/dpnp_array_api_info.py
@@ -36,9 +36,7 @@
 
 """
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
+import dpnp.tensor as dpt
 
 
 def __array_namespace_info__():
diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py
index 9fe955746593..14d9278579ba 100644
--- a/dpnp/dpnp_container.py
+++ b/dpnp/dpnp_container.py
@@ -37,10 +37,8 @@
 
 import dpctl.utils as dpu
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 from dpnp.dpnp_array import dpnp_array
 
 __all__ = [
diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index 13b957ffff8f..acca10a2211b 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -48,13 +48,11 @@
 import dpctl.utils as dpu
 import numpy
 
-# pylint: disable=no-name-in-module
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
-from dpctl_ext.tensor._device import normalize_queue_device
+
+# pylint: disable=no-name-in-module
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 
 from .dpnp_array import dpnp_array
 from .dpnp_utils import (
@@ -62,6 +60,7 @@
     map_dtype_to_device,
     use_origin_backend,
 )
+from .tensor._device import normalize_queue_device
 
 
 def are_same_logical_tensors(ar1, ar2):
diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py
index 2800df0b2ac8..6062dca6cdfc 100644
--- a/dpnp/dpnp_iface_arraycreation.py
+++ b/dpnp/dpnp_iface_arraycreation.py
@@ -45,10 +45,8 @@
 
 import numpy
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 from dpnp import dpnp_container
 
 from .dpnp_algo.dpnp_arraycreation import (
diff --git a/dpnp/dpnp_iface_bitwise.py b/dpnp/dpnp_iface_bitwise.py
index bff5c4e3aed9..604fd365ee18 100644
--- a/dpnp/dpnp_iface_bitwise.py
+++ b/dpnp/dpnp_iface_bitwise.py
@@ -45,10 +45,8 @@
 
 import numpy
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor._tensor_elementwise_impl as ti
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
+import dpnp.tensor._tensor_elementwise_impl as ti
 from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc
 
 
diff --git a/dpnp/dpnp_iface_counting.py b/dpnp/dpnp_iface_counting.py
index a8ebafbcead7..7bb13422f819 100644
--- a/dpnp/dpnp_iface_counting.py
+++ b/dpnp/dpnp_iface_counting.py
@@ -39,10 +39,8 @@
 
 """
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 
 
 def count_nonzero(a, axis=None, *, keepdims=False, out=None):
diff --git a/dpnp/dpnp_iface_functional.py b/dpnp/dpnp_iface_functional.py
index 797d8a736276..0ed965b0698f 100644
--- a/dpnp/dpnp_iface_functional.py
+++ b/dpnp/dpnp_iface_functional.py
@@ -43,16 +43,13 @@
 
 import dpnp
 
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import (
+# pylint: disable=no-name-in-module
+from .dpnp_utils import get_usm_allocations
+from .tensor._numpy_helper import (
     normalize_axis_index,
     normalize_axis_tuple,
 )
 
-# pylint: disable=no-name-in-module
-from dpnp.dpnp_utils import get_usm_allocations
-
 
 def apply_along_axis(func1d, axis, arr, *args, **kwargs):
     """
diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index 4b8fb7bb6a38..a24c8f56844a 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -47,18 +47,14 @@
 import dpctl.utils as dpu
 import numpy
 
-# pylint: disable=no-name-in-module
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 
 # pylint: disable=no-name-in-module
 import dpnp.backend.extensions.indexing._indexing_impl as indexing_ext
-from dpctl_ext.tensor._copy_utils import _nonzero_impl
-from dpctl_ext.tensor._indexing_functions import _get_indexing_mode
-from dpctl_ext.tensor._numpy_helper import normalize_axis_index
+
+# pylint: disable=no-name-in-module
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 
 # pylint: disable=no-name-in-module
 from .dpnp_algo import (
@@ -66,6 +62,9 @@
 )
 from .dpnp_array import dpnp_array
 from .dpnp_utils import call_origin, get_usm_allocations
+from .tensor._copy_utils import _nonzero_impl
+from .tensor._indexing_functions import _get_indexing_mode
+from .tensor._numpy_helper import normalize_axis_index
 
 
 def _ravel_multi_index_checks(multi_index, dims, order):
diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py
index 616d1e548a34..ce1d40774ca0 100644
--- a/dpnp/dpnp_iface_logic.py
+++ b/dpnp/dpnp_iface_logic.py
@@ -46,12 +46,10 @@
 import dpctl.utils as dpu
 import numpy
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_elementwise_impl as ti
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_elementwise_impl as ti
 
 from .dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc
 from .dpnp_array import dpnp_array
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index 0fc2c3f80fde..f73ac9ca8ecb 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -47,21 +47,19 @@
 import dpctl
 import numpy
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
-from dpctl_ext.tensor._numpy_helper import (
-    AxisError,
-    normalize_axis_index,
-    normalize_axis_tuple,
-)
+import dpnp.tensor as dpt
 
 from .dpnp_array import dpnp_array
 
 # pylint: disable=no-name-in-module
 from .dpnp_utils import get_usm_allocations
 from .dpnp_utils.dpnp_utils_pad import dpnp_pad
+from .tensor._numpy_helper import (
+    AxisError,
+    normalize_axis_index,
+    normalize_axis_tuple,
+)
 
 
 class InsertDeleteParams(NamedTuple):
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index 89bc08681604..22517d9cccca 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -50,17 +50,11 @@
 import dpctl.utils as dpu
 import numpy
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_elementwise_impl as ti
-import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
-from dpctl_ext.tensor._numpy_helper import (
-    normalize_axis_index,
-    normalize_axis_tuple,
-)
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_elementwise_impl as ti
+import dpnp.tensor._type_utils as dtu
 
 from .dpnp_algo.dpnp_elementwise_common import (
     DPNPI0,
@@ -86,6 +80,10 @@
 from .dpnp_utils import get_usm_allocations
 from .dpnp_utils.dpnp_utils_linearalgebra import dpnp_cross
 from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call
+from .tensor._numpy_helper import (
+    normalize_axis_index,
+    normalize_axis_tuple,
+)
 
 
 def _get_max_min(dtype):
diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py
index 19279f81286a..856fdbc98936 100644
--- a/dpnp/dpnp_iface_searching.py
+++ b/dpnp/dpnp_iface_searching.py
@@ -39,13 +39,14 @@
 
 """
 
-# pylint: disable=no-name-in-module
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as dti
+# pylint: disable=duplicate-code
+
 import dpnp
 
+# pylint: disable=no-name-in-module
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as dti
+
 from .dpnp_array import dpnp_array
 from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call
 
diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py
index e7abef1f4338..c24b1a4bc886 100644
--- a/dpnp/dpnp_iface_sorting.py
+++ b/dpnp/dpnp_iface_sorting.py
@@ -41,11 +41,8 @@
 
 from collections.abc import Sequence
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
-from dpctl_ext.tensor._numpy_helper import normalize_axis_index
+import dpnp.tensor as dpt
 
 # pylint: disable=no-name-in-module
 from .dpnp_algo import (
@@ -55,6 +52,7 @@
 from .dpnp_utils import (
     map_dtype_to_device,
 )
+from .tensor._numpy_helper import normalize_axis_index
 
 
 def _wrap_sort_argsort(
diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py
index 4063233dc981..3d1f62ef716e 100644
--- a/dpnp/dpnp_iface_statistics.py
+++ b/dpnp/dpnp_iface_statistics.py
@@ -46,13 +46,10 @@
 import dpctl.utils as dpu
 import numpy
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_elementwise_impl as ti
 import dpnp
 import dpnp.backend.extensions.statistics._statistics_impl as statistics_ext
-from dpctl_ext.tensor._numpy_helper import normalize_axis_index
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_elementwise_impl as ti
 
 from .dpnp_utils import get_usm_allocations
 from .dpnp_utils.dpnp_utils_common import (
@@ -61,6 +58,7 @@
 )
 from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call
 from .dpnp_utils.dpnp_utils_statistics import dpnp_cov, dpnp_median
+from .tensor._numpy_helper import normalize_axis_index
 
 
 def _count_reduce_items(arr, axis, where=True):
diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py
index 906a20f1625e..35428a0416e7 100644
--- a/dpnp/dpnp_iface_trigonometric.py
+++ b/dpnp/dpnp_iface_trigonometric.py
@@ -42,13 +42,11 @@
 # pylint: disable=protected-access
 # pylint: disable=no-name-in-module
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_elementwise_impl as ti
-import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_elementwise_impl as ti
+import dpnp.tensor._type_utils as dtu
 
 from .dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc
 from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call
diff --git a/dpnp/dpnp_iface_types.py b/dpnp/dpnp_iface_types.py
index 7d2d60089d98..d3b295289831 100644
--- a/dpnp/dpnp_iface_types.py
+++ b/dpnp/dpnp_iface_types.py
@@ -39,10 +39,8 @@
 import dpctl
 import numpy
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 
 from .dpnp_array import dpnp_array
 
diff --git a/dpnp/dpnp_utils/dpnp_algo_utils.pyx b/dpnp/dpnp_utils/dpnp_algo_utils.pyx
index 6ef9c9c28a12..938d9118545b 100644
--- a/dpnp/dpnp_utils/dpnp_algo_utils.pyx
+++ b/dpnp/dpnp_utils/dpnp_algo_utils.pyx
@@ -532,7 +532,7 @@ cdef class dpnp_descriptor:
         return self.origin_pyobj
 
     def get_array(self):
-        if isinstance(self.origin_pyobj, dpctl.tensor.usm_ndarray):
+        if isinstance(self.origin_pyobj, dpnp.tensor.usm_ndarray):
             return self.origin_pyobj
         if isinstance(self.origin_pyobj, dpnp_array):
             return self.origin_pyobj.get_array()
diff --git a/dpnp/dpnp_utils/dpnp_utils_common.py b/dpnp/dpnp_utils/dpnp_utils_common.py
index aa294fefe275..55d0f57ca1e2 100644
--- a/dpnp/dpnp_utils/dpnp_utils_common.py
+++ b/dpnp/dpnp_utils/dpnp_utils_common.py
@@ -29,10 +29,8 @@
 
 from collections.abc import Iterable
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor._type_utils as dtu
 import dpnp
+import dpnp.tensor._type_utils as dtu
 from dpnp.dpnp_utils import map_dtype_to_device
 
 __all__ = [
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index 28ed40ab5f61..3ea0ec170bb3 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -30,20 +30,19 @@
 import numpy
 from dpctl.utils import ExecutionPlacementError
 
-# pylint: disable=no-name-in-module
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 import dpnp.backend.extensions.blas._blas_impl as bi
-from dpctl_ext.tensor._numpy_helper import (
+
+# pylint: disable=no-name-in-module
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
+from dpnp.dpnp_array import dpnp_array
+from dpnp.dpnp_utils import get_usm_allocations
+from dpnp.tensor._numpy_helper import (
     AxisError,
     normalize_axis_index,
     normalize_axis_tuple,
 )
-from dpnp.dpnp_array import dpnp_array
-from dpnp.dpnp_utils import get_usm_allocations
 
 __all__ = [
     "dpnp_cross",
diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py
index cd9932cb7153..6dd5d6433f82 100644
--- a/dpnp/dpnp_utils/dpnp_utils_statistics.py
+++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py
@@ -31,12 +31,10 @@
 import dpctl
 from dpctl.utils import ExecutionPlacementError
 
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
-from dpctl_ext.tensor._numpy_helper import normalize_axis_tuple
+import dpnp.tensor as dpt
 from dpnp.dpnp_array import dpnp_array
+from dpnp.tensor._numpy_helper import normalize_axis_tuple
 
 __all__ = ["dpnp_cov", "dpnp_median"]
 
diff --git a/dpnp/exceptions/__init__.py b/dpnp/exceptions/__init__.py
index 7abcdbf0553f..7e5a55961d51 100644
--- a/dpnp/exceptions/__init__.py
+++ b/dpnp/exceptions/__init__.py
@@ -35,9 +35,7 @@
 from dpctl.utils import ExecutionPlacementError
 from numpy.exceptions import AxisError
 
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._dlpack import DLPackCreationError
+from dpnp.tensor._dlpack import DLPackCreationError
 
 __all__ = [
     "AxisError",
diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py
index 20d0dcd0cff2..074f0a66d7bc 100644
--- a/dpnp/fft/dpnp_utils_fft.py
+++ b/dpnp/fft/dpnp_utils_fft.py
@@ -46,12 +46,10 @@
 import numpy
 from dpctl.utils import ExecutionPlacementError
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 import dpnp.backend.extensions.fft._fft_impl as fi
-from dpctl_ext.tensor._numpy_helper import (
+import dpnp.tensor._tensor_impl as ti
+from dpnp.tensor._numpy_helper import (
     normalize_axis_index,
     normalize_axis_tuple,
 )
diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py
index f4e0f96da5e6..625d387667ac 100644
--- a/dpnp/linalg/dpnp_iface_linalg.py
+++ b/dpnp/linalg/dpnp_iface_linalg.py
@@ -47,11 +47,8 @@
 import numpy
 
 import dpnp
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import normalize_axis_tuple
 from dpnp.backend.extensions.lapack._lapack_impl import LinAlgError
+from dpnp.tensor._numpy_helper import normalize_axis_tuple
 
 from .dpnp_utils_linalg import (
     assert_2d,
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 28e11f6188c5..9d7b32d48177 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -46,14 +46,13 @@
 import numpy
 from numpy import prod
 
-# pylint: disable=no-name-in-module
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 import dpnp.backend.extensions.lapack._lapack_impl as li
-from dpctl_ext.tensor._numpy_helper import normalize_axis_index
+
+# pylint: disable=no-name-in-module
+import dpnp.tensor._tensor_impl as ti
 from dpnp.dpnp_utils import get_usm_allocations
+from dpnp.tensor._numpy_helper import normalize_axis_index
 
 
 # pylint:disable=missing-class-docstring
diff --git a/dpnp/memory/_memory.py b/dpnp/memory/_memory.py
index 3e95baacd424..4987d14d97c0 100644
--- a/dpnp/memory/_memory.py
+++ b/dpnp/memory/_memory.py
@@ -30,9 +30,7 @@
 from dpctl.memory import MemoryUSMHost as DPCTLMemoryUSMHost
 from dpctl.memory import MemoryUSMShared as DPCTLMemoryUSMShared
 
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
+import dpnp.tensor as dpt
 
 
 def _add_ptr_property(cls):
diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py
index f7bdd5330d42..1e80797ca469 100644
--- a/dpnp/scipy/linalg/_utils.py
+++ b/dpnp/scipy/linalg/_utils.py
@@ -45,12 +45,11 @@
 
 import dpctl.utils as dpu
 
-# pylint: disable=no-name-in-module
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor._tensor_impl as ti
 import dpnp
 import dpnp.backend.extensions.lapack._lapack_impl as li
+
+# pylint: disable=no-name-in-module
+import dpnp.tensor._tensor_impl as ti
 from dpnp.dpnp_utils import get_usm_allocations
 from dpnp.linalg.dpnp_utils_linalg import _common_type
 
diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpnp/tensor/CMakeLists.txt
similarity index 92%
rename from dpctl_ext/tensor/CMakeLists.txt
rename to dpnp/tensor/CMakeLists.txt
index 13c9e248594c..0a8def4131df 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpnp/tensor/CMakeLists.txt
@@ -29,10 +29,39 @@
 
 find_package(Python COMPONENTS Development.Module)
 
+# Remove global coverage flags for tensor
+# use link-time only approach like dpctl
+if(DPNP_GENERATE_COVERAGE)
+    string(REPLACE "-fprofile-instr-generate " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    string(REPLACE "-fcoverage-mapping " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    string(REPLACE "-fno-sycl-use-footer " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+endif()
+
+# Tensor-specific debug flags
+# Disable device code debug info for Debug and Coverage builds to speed up linking
+if(
+    CMAKE_BUILD_TYPE STREQUAL "Debug"
+    OR CMAKE_BUILD_TYPE STREQUAL "DEBUG"
+    OR CMAKE_BUILD_TYPE STREQUAL "Coverage"
+)
+    if(WIN32)
+        add_compile_options(-Xsycl-target-frontend=spir64 "-g0")
+    elseif(UNIX)
+        add_compile_options(-Xsycl-target-frontend=spir64 "-g0")
+        if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "DEBUG")
+            string(REPLACE "-g1" "-g" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
+            string(REPLACE "-g1" "-g" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
+        endif()
+    endif()
+endif()
+
+# Suppress unused parameter warnings
+add_compile_options(-Wno-unused-parameter)
+
 file(GLOB _cython_sources *.pyx)
 foreach(_cy_file ${_cython_sources})
     get_filename_component(_trgt ${_cy_file} NAME_WLE)
-    build_dpctl_ext(${_trgt} ${_cy_file} "dpctl_ext/tensor" RELATIVE_PATH "..")
+    build_dpnp_tensor_ext(${_trgt} ${_cy_file} "dpnp/tensor" RELATIVE_PATH "..")
     target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
 endforeach()
 
@@ -304,8 +333,7 @@ foreach(python_module_name ${_py_trgts})
         ${python_module_name}
         PRIVATE -fsycl-device-code-split=per_kernel
     )
-    # TODO: expand DPCTL_OFFLOAD_COMPRESS to the whole dpnp level
-    if(DPCTL_OFFLOAD_COMPRESS)
+    if(DPNP_TENSOR_OFFLOAD_COMPRESS)
         target_link_options(${python_module_name} PRIVATE --offload-compress)
     endif()
 
@@ -319,8 +347,8 @@ foreach(python_module_name ${_py_trgts})
             ${CMAKE_BINARY_DIR} # For generated Cython headers
     )
     target_link_options(${python_module_name} PRIVATE ${_linker_options})
-    if(DPCTL_GENERATE_COVERAGE)
-        if(DPCTL_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS)
+    if(DPNP_GENERATE_COVERAGE)
+        if(DPNP_TENSOR_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS)
             target_compile_options(
                 ${python_module_name}
                 PRIVATE -fprofile-instr-generate -fcoverage-mapping
@@ -350,6 +378,5 @@ foreach(python_module_name ${_py_trgts})
             PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.."
         )
     endif()
-    # TODO: revert to `DESTINATION "dpctl/tensor"`
-    install(TARGETS ${python_module_name} DESTINATION "dpctl_ext/tensor")
+    install(TARGETS ${python_module_name} DESTINATION "dpnp/tensor")
 endforeach()
diff --git a/dpctl_ext/tensor/__init__.pxd b/dpnp/tensor/__init__.pxd
similarity index 100%
rename from dpctl_ext/tensor/__init__.pxd
rename to dpnp/tensor/__init__.pxd
diff --git a/dpctl_ext/tensor/__init__.py b/dpnp/tensor/__init__.py
similarity index 100%
rename from dpctl_ext/tensor/__init__.py
rename to dpnp/tensor/__init__.py
diff --git a/dpctl_ext/tensor/_accumulation.py b/dpnp/tensor/_accumulation.py
similarity index 98%
rename from dpctl_ext/tensor/_accumulation.py
rename to dpnp/tensor/_accumulation.py
index 8628628f3bf8..305cf263514e 100644
--- a/dpctl_ext/tensor/_accumulation.py
+++ b/dpnp/tensor/_accumulation.py
@@ -29,11 +29,9 @@
 import dpctl
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_accumulation_impl as tai
-import dpctl_ext.tensor._tensor_impl as ti
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_accumulation_impl as tai
+import dpnp.tensor._tensor_impl as ti
 
 from ._numpy_helper import normalize_axis_index
 from ._type_utils import (
diff --git a/dpctl_ext/tensor/_array_api.py b/dpnp/tensor/_array_api.py
similarity index 98%
rename from dpctl_ext/tensor/_array_api.py
rename to dpnp/tensor/_array_api.py
index 09f71bc1bdd3..a18bc2be1824 100644
--- a/dpctl_ext/tensor/_array_api.py
+++ b/dpnp/tensor/_array_api.py
@@ -28,9 +28,7 @@
 
 import dpctl
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
+import dpnp.tensor as dpt
 
 from ._tensor_impl import (
     default_device_complex_type,
diff --git a/dpctl_ext/tensor/_clip.py b/dpnp/tensor/_clip.py
similarity index 99%
rename from dpctl_ext/tensor/_clip.py
rename to dpnp/tensor/_clip.py
index 8071f13bee19..64020e88ce39 100644
--- a/dpctl_ext/tensor/_clip.py
+++ b/dpnp/tensor/_clip.py
@@ -29,11 +29,9 @@
 import dpctl
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_elementwise_impl as tei
-import dpctl_ext.tensor._tensor_impl as ti
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_elementwise_impl as tei
+import dpnp.tensor._tensor_impl as ti
 
 from ._copy_utils import (
     _empty_like_orderK,
diff --git a/dpctl_ext/tensor/_constants.py b/dpnp/tensor/_constants.py
similarity index 100%
rename from dpctl_ext/tensor/_constants.py
rename to dpnp/tensor/_constants.py
diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpnp/tensor/_copy_utils.py
similarity index 99%
rename from dpctl_ext/tensor/_copy_utils.py
rename to dpnp/tensor/_copy_utils.py
index b056511ac33b..9a16d4f59acd 100644
--- a/dpctl_ext/tensor/_copy_utils.py
+++ b/dpnp/tensor/_copy_utils.py
@@ -35,10 +35,8 @@
 import dpctl.utils
 import numpy as np
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 
 from ._data_types import _get_dtype
 from ._device import normalize_queue_device
diff --git a/dpctl_ext/tensor/_ctors.py b/dpnp/tensor/_ctors.py
similarity index 99%
rename from dpctl_ext/tensor/_ctors.py
rename to dpnp/tensor/_ctors.py
index 041faba73205..c6e14db7398f 100644
--- a/dpctl_ext/tensor/_ctors.py
+++ b/dpnp/tensor/_ctors.py
@@ -34,10 +34,8 @@
 import dpctl.utils
 import numpy as np
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 
 from ._copy_utils import (
     _empty_like_orderK,
diff --git a/dpctl_ext/tensor/_data_types.py b/dpnp/tensor/_data_types.py
similarity index 100%
rename from dpctl_ext/tensor/_data_types.py
rename to dpnp/tensor/_data_types.py
diff --git a/dpctl_ext/tensor/_device.py b/dpnp/tensor/_device.py
similarity index 100%
rename from dpctl_ext/tensor/_device.py
rename to dpnp/tensor/_device.py
diff --git a/dpctl_ext/tensor/_dldevice_conversions.py b/dpnp/tensor/_dldevice_conversions.py
similarity index 100%
rename from dpctl_ext/tensor/_dldevice_conversions.py
rename to dpnp/tensor/_dldevice_conversions.py
diff --git a/dpctl_ext/tensor/_dlpack.pxd b/dpnp/tensor/_dlpack.pxd
similarity index 100%
rename from dpctl_ext/tensor/_dlpack.pxd
rename to dpnp/tensor/_dlpack.pxd
diff --git a/dpctl_ext/tensor/_dlpack.pyx b/dpnp/tensor/_dlpack.pyx
similarity index 99%
rename from dpctl_ext/tensor/_dlpack.pyx
rename to dpnp/tensor/_dlpack.pyx
index 21b3d877c475..947377d3a660 100644
--- a/dpctl_ext/tensor/_dlpack.pyx
+++ b/dpnp/tensor/_dlpack.pyx
@@ -1086,7 +1086,7 @@ def from_dlpack(x, /, *, device=None, copy=None):
         .. code-block:: python
 
             import dpctl
-            import dpctl_ext.tensor as dpt
+            import dpnp.tensor as dpt
 
             class Container:
                 "Helper class implementing `__dlpack__` protocol"
@@ -1209,9 +1209,7 @@ def from_dlpack(x, /, *, device=None, copy=None):
                 )
                 return from_dlpack_capsule(cpu_caps)
             else:
-                # TODO: revert to `import dpctl.tensor`
-                # when dpnp fully migrates dpctl/tensor
-                import dpctl_ext.tensor as dpt
+                import dpnp.tensor as dpt
                 return dpt.asarray(blob, device=dev)
         elif got_buffer_error:
             # we are here, because dlpack_attr could not deal with requested
diff --git a/dpctl_ext/tensor/_elementwise_common.py b/dpnp/tensor/_elementwise_common.py
similarity index 99%
rename from dpctl_ext/tensor/_elementwise_common.py
rename to dpnp/tensor/_elementwise_common.py
index ffe849db9cad..d312d50a4a8f 100644
--- a/dpctl_ext/tensor/_elementwise_common.py
+++ b/dpnp/tensor/_elementwise_common.py
@@ -29,10 +29,8 @@
 import dpctl
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 
 from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
 from ._manipulation_functions import _broadcast_shape_impl
diff --git a/dpctl_ext/tensor/_elementwise_funcs.py b/dpnp/tensor/_elementwise_funcs.py
similarity index 99%
rename from dpctl_ext/tensor/_elementwise_funcs.py
rename to dpnp/tensor/_elementwise_funcs.py
index 6442ef0b4594..5d38cad0c2a5 100644
--- a/dpctl_ext/tensor/_elementwise_funcs.py
+++ b/dpnp/tensor/_elementwise_funcs.py
@@ -26,9 +26,7 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor._tensor_elementwise_impl as ti
+import dpnp.tensor._tensor_elementwise_impl as ti
 
 from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc
 from ._type_utils import (
diff --git a/dpctl_ext/tensor/_flags.pyx b/dpnp/tensor/_flags.pyx
similarity index 100%
rename from dpctl_ext/tensor/_flags.pyx
rename to dpnp/tensor/_flags.pyx
diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpnp/tensor/_indexing_functions.py
similarity index 99%
rename from dpctl_ext/tensor/_indexing_functions.py
rename to dpnp/tensor/_indexing_functions.py
index 08db81c1b166..8f097e59efc3 100644
--- a/dpctl_ext/tensor/_indexing_functions.py
+++ b/dpnp/tensor/_indexing_functions.py
@@ -31,10 +31,8 @@
 import dpctl
 import dpctl.utils
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 
 from ._copy_utils import (
     _extract_impl,
diff --git a/dpctl_ext/tensor/_linear_algebra_functions.py b/dpnp/tensor/_linear_algebra_functions.py
similarity index 99%
rename from dpctl_ext/tensor/_linear_algebra_functions.py
rename to dpnp/tensor/_linear_algebra_functions.py
index 6dfb30e881b2..bec0522cd18f 100644
--- a/dpctl_ext/tensor/_linear_algebra_functions.py
+++ b/dpnp/tensor/_linear_algebra_functions.py
@@ -31,12 +31,10 @@
 import dpctl
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_elementwise_impl as tei
-import dpctl_ext.tensor._tensor_impl as ti
-import dpctl_ext.tensor._tensor_linalg_impl as tli
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_elementwise_impl as tei
+import dpnp.tensor._tensor_impl as ti
+import dpnp.tensor._tensor_linalg_impl as tli
 
 from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
 from ._manipulation_functions import _broadcast_shape_impl
diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpnp/tensor/_manipulation_functions.py
similarity index 99%
rename from dpctl_ext/tensor/_manipulation_functions.py
rename to dpnp/tensor/_manipulation_functions.py
index 33817dd0aa2e..965bafda7948 100644
--- a/dpctl_ext/tensor/_manipulation_functions.py
+++ b/dpnp/tensor/_manipulation_functions.py
@@ -33,10 +33,8 @@
 import dpctl.utils as dputils
 import numpy as np
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 
 from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
 from ._type_utils import _supported_dtype, _to_device_supported_dtype
diff --git a/dpctl_ext/tensor/_numpy_helper.py b/dpnp/tensor/_numpy_helper.py
similarity index 100%
rename from dpctl_ext/tensor/_numpy_helper.py
rename to dpnp/tensor/_numpy_helper.py
diff --git a/dpctl_ext/tensor/_print.py b/dpnp/tensor/_print.py
similarity index 99%
rename from dpctl_ext/tensor/_print.py
rename to dpnp/tensor/_print.py
index 5385eadb2537..c9325af9d312 100644
--- a/dpctl_ext/tensor/_print.py
+++ b/dpnp/tensor/_print.py
@@ -34,10 +34,8 @@
 import dpctl.utils
 import numpy as np
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 
 __doc__ = "Print functions for :class:`dpctl.tensor.usm_ndarray`."
 
diff --git a/dpctl_ext/tensor/_reduction.py b/dpnp/tensor/_reduction.py
similarity index 99%
rename from dpctl_ext/tensor/_reduction.py
rename to dpnp/tensor/_reduction.py
index 79e620605f07..82b75503e269 100644
--- a/dpctl_ext/tensor/_reduction.py
+++ b/dpnp/tensor/_reduction.py
@@ -29,11 +29,9 @@
 import dpctl
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
-import dpctl_ext.tensor._tensor_reductions_impl as tri
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
+import dpnp.tensor._tensor_reductions_impl as tri
 
 from ._numpy_helper import normalize_axis_tuple
 from ._type_utils import (
diff --git a/dpctl_ext/tensor/_reshape.py b/dpnp/tensor/_reshape.py
similarity index 98%
rename from dpctl_ext/tensor/_reshape.py
rename to dpnp/tensor/_reshape.py
index 7ecdace4fc42..6d817c5ccdf0 100644
--- a/dpctl_ext/tensor/_reshape.py
+++ b/dpnp/tensor/_reshape.py
@@ -31,9 +31,7 @@
 import dpctl.utils
 import numpy as np
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
+import dpnp.tensor as dpt
 
 from ._tensor_impl import (
     _copy_usm_ndarray_for_reshape,
diff --git a/dpctl_ext/tensor/_scalar_utils.py b/dpnp/tensor/_scalar_utils.py
similarity index 97%
rename from dpctl_ext/tensor/_scalar_utils.py
rename to dpnp/tensor/_scalar_utils.py
index 84abdf7b5a52..828f01f1c862 100644
--- a/dpctl_ext/tensor/_scalar_utils.py
+++ b/dpnp/tensor/_scalar_utils.py
@@ -31,9 +31,7 @@
 import dpctl.memory as dpm
 import numpy as np
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
+import dpnp.tensor as dpt
 
 from ._type_utils import (
     WeakBooleanType,
diff --git a/dpctl_ext/tensor/_search_functions.py b/dpnp/tensor/_search_functions.py
similarity index 98%
rename from dpctl_ext/tensor/_search_functions.py
rename to dpnp/tensor/_search_functions.py
index aae185b64e2b..7e443351311a 100644
--- a/dpctl_ext/tensor/_search_functions.py
+++ b/dpnp/tensor/_search_functions.py
@@ -29,10 +29,8 @@
 import dpctl
 from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 
 from ._copy_utils import _empty_like_orderK, _empty_like_triple_orderK
 from ._manipulation_functions import _broadcast_shape_impl
diff --git a/dpctl_ext/tensor/_searchsorted.py b/dpnp/tensor/_searchsorted.py
similarity index 98%
rename from dpctl_ext/tensor/_searchsorted.py
rename to dpnp/tensor/_searchsorted.py
index 4c680a49b07b..66a2df7ff375 100644
--- a/dpctl_ext/tensor/_searchsorted.py
+++ b/dpnp/tensor/_searchsorted.py
@@ -41,9 +41,6 @@
 )
 from ._tensor_sorting_impl import _searchsorted_left, _searchsorted_right
 from ._type_utils import isdtype, result_type
-
-# TODO: revert to `from ._usmarray import...`
-# when dpnp fully migrates dpctl/tensor
 from ._usmarray import usm_ndarray
 
 
diff --git a/dpctl_ext/tensor/_set_functions.py b/dpnp/tensor/_set_functions.py
similarity index 99%
rename from dpctl_ext/tensor/_set_functions.py
rename to dpnp/tensor/_set_functions.py
index 76840461b5e9..e6131ddf7d2a 100644
--- a/dpctl_ext/tensor/_set_functions.py
+++ b/dpnp/tensor/_set_functions.py
@@ -30,9 +30,7 @@
 
 import dpctl.utils as du
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
+import dpnp.tensor as dpt
 
 from ._copy_utils import _empty_like_orderK
 from ._scalar_utils import (
diff --git a/dpctl_ext/tensor/_slicing.pxi b/dpnp/tensor/_slicing.pxi
similarity index 100%
rename from dpctl_ext/tensor/_slicing.pxi
rename to dpnp/tensor/_slicing.pxi
diff --git a/dpctl_ext/tensor/_sorting.py b/dpnp/tensor/_sorting.py
similarity index 99%
rename from dpctl_ext/tensor/_sorting.py
rename to dpnp/tensor/_sorting.py
index 42cd9e1b44be..fb4d3e4d98e4 100644
--- a/dpctl_ext/tensor/_sorting.py
+++ b/dpnp/tensor/_sorting.py
@@ -31,10 +31,8 @@
 
 import dpctl.utils as du
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 
 from ._numpy_helper import normalize_axis_index
 from ._tensor_sorting_impl import (
diff --git a/dpctl_ext/tensor/_statistical_functions.py b/dpnp/tensor/_statistical_functions.py
similarity index 98%
rename from dpctl_ext/tensor/_statistical_functions.py
rename to dpnp/tensor/_statistical_functions.py
index c1544b84c6a7..3d717554b5f8 100644
--- a/dpctl_ext/tensor/_statistical_functions.py
+++ b/dpnp/tensor/_statistical_functions.py
@@ -27,12 +27,10 @@
 
 import dpctl.utils as du
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_elementwise_impl as tei
-import dpctl_ext.tensor._tensor_impl as ti
-import dpctl_ext.tensor._tensor_reductions_impl as tri
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_elementwise_impl as tei
+import dpnp.tensor._tensor_impl as ti
+import dpnp.tensor._tensor_reductions_impl as tri
 
 from ._numpy_helper import normalize_axis_tuple
 
diff --git a/dpctl_ext/tensor/_stride_utils.pxi b/dpnp/tensor/_stride_utils.pxi
similarity index 100%
rename from dpctl_ext/tensor/_stride_utils.pxi
rename to dpnp/tensor/_stride_utils.pxi
diff --git a/dpctl_ext/tensor/_testing.py b/dpnp/tensor/_testing.py
similarity index 98%
rename from dpctl_ext/tensor/_testing.py
rename to dpnp/tensor/_testing.py
index 4c9f5ebac9a4..ec1f0c47be60 100644
--- a/dpctl_ext/tensor/_testing.py
+++ b/dpnp/tensor/_testing.py
@@ -29,9 +29,7 @@
 import dpctl.utils as du
 import numpy as np
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
+import dpnp.tensor as dpt
 
 from ._manipulation_functions import _broadcast_shape_impl
 from ._type_utils import _to_device_supported_dtype
diff --git a/dpctl_ext/tensor/_type_utils.py b/dpnp/tensor/_type_utils.py
similarity index 99%
rename from dpctl_ext/tensor/_type_utils.py
rename to dpnp/tensor/_type_utils.py
index 8c15053cb4c1..3da9e7994760 100644
--- a/dpctl_ext/tensor/_type_utils.py
+++ b/dpnp/tensor/_type_utils.py
@@ -30,10 +30,8 @@
 
 import numpy as np
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 
 
 def _all_data_types(_fp16, _fp64):
diff --git a/dpctl_ext/tensor/_types.pxi b/dpnp/tensor/_types.pxi
similarity index 100%
rename from dpctl_ext/tensor/_types.pxi
rename to dpnp/tensor/_types.pxi
diff --git a/dpctl_ext/tensor/_usmarray.pxd b/dpnp/tensor/_usmarray.pxd
similarity index 100%
rename from dpctl_ext/tensor/_usmarray.pxd
rename to dpnp/tensor/_usmarray.pxd
diff --git a/dpctl_ext/tensor/_usmarray.pyx b/dpnp/tensor/_usmarray.pyx
similarity index 93%
rename from dpctl_ext/tensor/_usmarray.pyx
rename to dpnp/tensor/_usmarray.pyx
index e3b33fd71ac8..519fefed6129 100644
--- a/dpctl_ext/tensor/_usmarray.pyx
+++ b/dpnp/tensor/_usmarray.pyx
@@ -37,8 +37,7 @@ import numpy as np
 from dpctl._backend cimport DPCTLSyclUSMRef
 from dpctl._sycl_device_factory cimport _cached_default_device
 
-# TODO: remote it when dpnp fully migrates dpctl/tensor
-import dpctl_ext
+import dpnp.tensor
 
 from ._data_types import bool as dpt_bool
 from ._device import Device
@@ -466,7 +465,7 @@ cdef class usm_ndarray:
 
             .. code-block:: python
 
-                from dpctl_ext import tensor
+                from dpnp import tensor
 
                 x = tensor.ones((3, 10, 7))
                 y = tensor.flip(x[:, 1::2], axis=1)
@@ -660,7 +659,7 @@ cdef class usm_ndarray:
 
             .. code-block:: python
 
-                from dpctl_ext import tensor
+                from dpnp import tensor
 
                 x = tensor.arange(899)
                 x.shape = (29, 31)
@@ -765,7 +764,7 @@ cdef class usm_ndarray:
 
             .. code-block:: python
 
-                from dpctl_ext import tensor
+                from dpnp import tensor
 
                 x = tensor.zeros((20, 30))
                 xv = x[10:, :15]
@@ -878,7 +877,7 @@ cdef class usm_ndarray:
 
             .. code-block:: python
 
-                >>> from dpctl_ext import tensor
+                >>> from dpnp import tensor
                 >>> x = tensor.ones(10)
                 >>> x.device
                 Device(level_zero:gpu:0)
@@ -929,7 +928,7 @@ cdef class usm_ndarray:
 
             .. code-block:: python
 
-                from dpctl_ext import tensor
+                from dpnp import tensor
 
                 # Create complex array from
                 # arrays of real and imaginary parts
@@ -958,7 +957,7 @@ cdef class usm_ndarray:
 
             .. code-block:: python
 
-                from dpctl_ext import tensor
+                from dpnp import tensor
 
                 # Reset imaginary part of complex array
 
@@ -1050,7 +1049,7 @@ cdef class usm_ndarray:
             .. code-block:: python
 
                 import dpctl
-                import dpctl_ext.tensor as dpt
+                import dpnp.tensor as dpt
 
                 x = dpt.full(10**6, 2, dtype="int64")
                 q_prof = dpctl.SyclQueue(
@@ -1146,9 +1145,7 @@ cdef class usm_ndarray:
         return (
             self.array_namespace_
             if self.array_namespace_ is not None
-            # TODO: revert to `else dpctl.tensor`
-            # when dpnp fully migrates dpctl/tensor
-            else dpctl_ext.tensor
+            else dpnp.tensor
         )
 
     def __bool__(self):
@@ -1204,19 +1201,17 @@ cdef class usm_ndarray:
         raise IndexError("only integer arrays are valid indices")
 
     def __abs__(self):
-        # TODO: revert to `return dpctl.tensor...`
-        # when dpnp fully migrates dpctl/tensor
-        return dpctl_ext.tensor.abs(self)
+        return dpnp.tensor.abs(self)
 
     def __add__(self, other):
         """
         Implementation for operator.add
         """
-        return dpctl_ext.tensor.add(self, other)
+        return dpnp.tensor.add(self, other)
 
     def __and__(self, other):
         "Implementation for operator.and"
-        return dpctl_ext.tensor.bitwise_and(self, other)
+        return dpnp.tensor.bitwise_and(self, other)
 
     def __dlpack__(
         self, *, stream=None, max_version=None, dl_device=None, copy=None
@@ -1375,24 +1370,22 @@ cdef class usm_ndarray:
         )
 
     def __eq__(self, other):
-        # TODO: revert to `return dpctl.tensor...`
-        # when dpnp fully migrates dpctl/tensor
-        return dpctl_ext.tensor.equal(self, other)
+        return dpnp.tensor.equal(self, other)
 
     def __floordiv__(self, other):
-        return dpctl_ext.tensor.floor_divide(self, other)
+        return dpnp.tensor.floor_divide(self, other)
 
     def __ge__(self, other):
-        return dpctl_ext.tensor.greater_equal(self, other)
+        return dpnp.tensor.greater_equal(self, other)
 
     def __gt__(self, other):
-        return dpctl_ext.tensor.greater(self, other)
+        return dpnp.tensor.greater(self, other)
 
     def __invert__(self):
-        return dpctl_ext.tensor.bitwise_invert(self)
+        return dpnp.tensor.bitwise_invert(self)
 
     def __le__(self, other):
-        return dpctl_ext.tensor.less_equal(self, other)
+        return dpnp.tensor.less_equal(self, other)
 
     def __len__(self):
         if (self.nd_):
@@ -1401,37 +1394,37 @@ cdef class usm_ndarray:
             raise TypeError("len() of unsized object")
 
     def __lshift__(self, other):
-        return dpctl_ext.tensor.bitwise_left_shift(self, other)
+        return dpnp.tensor.bitwise_left_shift(self, other)
 
     def __lt__(self, other):
-        return dpctl_ext.tensor.less(self, other)
+        return dpnp.tensor.less(self, other)
 
     def __matmul__(self, other):
-        return dpctl_ext.tensor.matmul(self, other)
+        return dpnp.tensor.matmul(self, other)
 
     def __mod__(self, other):
-        return dpctl_ext.tensor.remainder(self, other)
+        return dpnp.tensor.remainder(self, other)
 
     def __mul__(self, other):
-        return dpctl_ext.tensor.multiply(self, other)
+        return dpnp.tensor.multiply(self, other)
 
     def __ne__(self, other):
-        return dpctl_ext.tensor.not_equal(self, other)
+        return dpnp.tensor.not_equal(self, other)
 
     def __neg__(self):
-        return dpctl_ext.tensor.negative(self)
+        return dpnp.tensor.negative(self)
 
     def __or__(self, other):
-        return dpctl_ext.tensor.bitwise_or(self, other)
+        return dpnp.tensor.bitwise_or(self, other)
 
     def __pos__(self):
-        return dpctl_ext.tensor.positive(self)
+        return dpnp.tensor.positive(self)
 
     def __pow__(self, other):
-        return dpctl_ext.tensor.pow(self, other)
+        return dpnp.tensor.pow(self, other)
 
     def __rshift__(self, other):
-        return dpctl_ext.tensor.bitwise_right_shift(self, other)
+        return dpnp.tensor.bitwise_right_shift(self, other)
 
     def __setitem__(self, key, rhs):
         cdef tuple _meta
@@ -1476,7 +1469,7 @@ cdef class usm_ndarray:
                 _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs)
             else:
                 if hasattr(rhs, "__sycl_usm_array_interface__"):
-                    from dpctl_ext.tensor import asarray
+                    from dpnp.tensor import asarray
                     try:
                         rhs_ar = asarray(rhs)
                         _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs_ar)
@@ -1524,93 +1517,91 @@ cdef class usm_ndarray:
         return
 
     def __sub__(self, other):
-        # TODO: revert to `return dpctl.tensor...`
-        # when dpnp fully migrates dpctl/tensor
-        return dpctl_ext.tensor.subtract(self, other)
+        return dpnp.tensor.subtract(self, other)
 
     def __truediv__(self, other):
-        return dpctl_ext.tensor.divide(self, other)
+        return dpnp.tensor.divide(self, other)
 
     def __xor__(self, other):
-        return dpctl_ext.tensor.bitwise_xor(self, other)
+        return dpnp.tensor.bitwise_xor(self, other)
 
     def __radd__(self, other):
-        return dpctl_ext.tensor.add(other, self)
+        return dpnp.tensor.add(other, self)
 
     def __rand__(self, other):
-        return dpctl_ext.tensor.bitwise_and(other, self)
+        return dpnp.tensor.bitwise_and(other, self)
 
     def __rfloordiv__(self, other):
-        return dpctl_ext.tensor.floor_divide(other, self)
+        return dpnp.tensor.floor_divide(other, self)
 
     def __rlshift__(self, other):
-        return dpctl_ext.tensor.bitwise_left_shift(other, self)
+        return dpnp.tensor.bitwise_left_shift(other, self)
 
     def __rmatmul__(self, other):
-        return dpctl_ext.tensor.matmul(other, self)
+        return dpnp.tensor.matmul(other, self)
 
     def __rmod__(self, other):
-        return dpctl_ext.tensor.remainder(other, self)
+        return dpnp.tensor.remainder(other, self)
 
     def __rmul__(self, other):
-        return dpctl_ext.tensor.multiply(other, self)
+        return dpnp.tensor.multiply(other, self)
 
     def __ror__(self, other):
-        return dpctl_ext.tensor.bitwise_or(other, self)
+        return dpnp.tensor.bitwise_or(other, self)
 
     def __rpow__(self, other):
-        return dpctl_ext.tensor.pow(other, self)
+        return dpnp.tensor.pow(other, self)
 
     def __rrshift__(self, other):
-        return dpctl_ext.tensor.bitwise_right_shift(other, self)
+        return dpnp.tensor.bitwise_right_shift(other, self)
 
     def __rsub__(self, other):
-        return dpctl_ext.tensor.subtract(other, self)
+        return dpnp.tensor.subtract(other, self)
 
     def __rtruediv__(self, other):
-        return dpctl_ext.tensor.divide(other, self)
+        return dpnp.tensor.divide(other, self)
 
     def __rxor__(self, other):
-        return dpctl_ext.tensor.bitwise_xor(other, self)
+        return dpnp.tensor.bitwise_xor(other, self)
 
     def __iadd__(self, other):
-        return dpctl_ext.tensor.add._inplace_op(self, other)
+        return dpnp.tensor.add._inplace_op(self, other)
 
     def __iand__(self, other):
-        return dpctl_ext.tensor.bitwise_and._inplace_op(self, other)
+        return dpnp.tensor.bitwise_and._inplace_op(self, other)
 
     def __ifloordiv__(self, other):
-        return dpctl_ext.tensor.floor_divide._inplace_op(self, other)
+        return dpnp.tensor.floor_divide._inplace_op(self, other)
 
     def __ilshift__(self, other):
-        return dpctl_ext.tensor.bitwise_left_shift._inplace_op(self, other)
+        return dpnp.tensor.bitwise_left_shift._inplace_op(self, other)
 
     def __imatmul__(self, other):
-        return dpctl_ext.tensor.matmul(self, other, out=self, dtype=self.dtype)
+        return dpnp.tensor.matmul(self, other, out=self, dtype=self.dtype)
 
     def __imod__(self, other):
-        return dpctl_ext.tensor.remainder._inplace_op(self, other)
+        return dpnp.tensor.remainder._inplace_op(self, other)
 
     def __imul__(self, other):
-        return dpctl_ext.tensor.multiply._inplace_op(self, other)
+        return dpnp.tensor.multiply._inplace_op(self, other)
 
     def __ior__(self, other):
-        return dpctl_ext.tensor.bitwise_or._inplace_op(self, other)
+        return dpnp.tensor.bitwise_or._inplace_op(self, other)
 
     def __ipow__(self, other):
-        return dpctl_ext.tensor.pow._inplace_op(self, other)
+        return dpnp.tensor.pow._inplace_op(self, other)
 
     def __irshift__(self, other):
-        return dpctl_ext.tensor.bitwise_right_shift._inplace_op(self, other)
+        return dpnp.tensor.bitwise_right_shift._inplace_op(self, other)
 
     def __isub__(self, other):
-        return dpctl_ext.tensor.subtract._inplace_op(self, other)
+        return dpnp.tensor.subtract._inplace_op(self, other)
 
     def __itruediv__(self, other):
-        return dpctl_ext.tensor.divide._inplace_op(self, other)
+        return dpnp.tensor.divide._inplace_op(self, other)
 
     def __ixor__(self, other):
-        return dpctl_ext.tensor.bitwise_xor._inplace_op(self, other)
+        return dpnp.tensor.bitwise_xor._inplace_op(self, other)
 
     def __str__(self):
         return usm_ndarray_str(self)
diff --git a/dpctl_ext/tensor/_utility_functions.py b/dpnp/tensor/_utility_functions.py
similarity index 98%
rename from dpctl_ext/tensor/_utility_functions.py
rename to dpnp/tensor/_utility_functions.py
index c892d777102d..644c2ce9911f 100644
--- a/dpctl_ext/tensor/_utility_functions.py
+++ b/dpnp/tensor/_utility_functions.py
@@ -31,11 +31,9 @@
 
 import dpctl.utils as du
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_impl as ti
-import dpctl_ext.tensor._tensor_reductions_impl as tri
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
+import dpnp.tensor._tensor_reductions_impl as tri
 
 from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
 from ._scalar_utils import (
diff --git a/dpctl_ext/tensor/include/dlpack/LICENSE.third-party b/dpnp/tensor/include/dlpack/LICENSE.third-party
similarity index 100%
rename from dpctl_ext/tensor/include/dlpack/LICENSE.third-party
rename to dpnp/tensor/include/dlpack/LICENSE.third-party
diff --git a/dpctl_ext/tensor/include/dlpack/README.md b/dpnp/tensor/include/dlpack/README.md
similarity index 100%
rename from dpctl_ext/tensor/include/dlpack/README.md
rename to dpnp/tensor/include/dlpack/README.md
diff --git a/dpctl_ext/tensor/include/dlpack/dlpack.h b/dpnp/tensor/include/dlpack/dlpack.h
similarity index 100%
rename from dpctl_ext/tensor/include/dlpack/dlpack.h
rename to dpnp/tensor/include/dlpack/dlpack.h
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp b/dpnp/tensor/libtensor/include/kernels/accumulators.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp
rename to dpnp/tensor/libtensor/include/kernels/accumulators.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp b/dpnp/tensor/libtensor/include/kernels/alignment.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp
rename to dpnp/tensor/libtensor/include/kernels/alignment.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpnp/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
rename to dpnp/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/clip.hpp b/dpnp/tensor/libtensor/include/kernels/clip.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/clip.hpp
rename to dpnp/tensor/libtensor/include/kernels/clip.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpnp/tensor/libtensor/include/kernels/constructors.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp
rename to dpnp/tensor/libtensor/include/kernels/constructors.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp
rename to dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
rename to dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpnp/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
rename to dpnp/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
rename to dpnp/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpnp/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
rename to dpnp/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
rename to dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
rename to dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp b/dpnp/tensor/libtensor/include/kernels/reductions.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp
rename to dpnp/tensor/libtensor/include/kernels/reductions.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp b/dpnp/tensor/libtensor/include/kernels/repeat.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp
rename to dpnp/tensor/libtensor/include/kernels/repeat.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/isin.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp
rename to dpnp/tensor/libtensor/include/kernels/sorting/isin.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
rename to dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
rename to dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp
rename to dpnp/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
rename to dpnp/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp
rename to dpnp/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
rename to dpnp/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp
rename to dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/where.hpp b/dpnp/tensor/libtensor/include/kernels/where.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/kernels/where.hpp
rename to dpnp/tensor/libtensor/include/kernels/where.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp b/dpnp/tensor/libtensor/include/utils/indexing_utils.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp
rename to dpnp/tensor/libtensor/include/utils/indexing_utils.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp b/dpnp/tensor/libtensor/include/utils/math_utils.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp
rename to dpnp/tensor/libtensor/include/utils/math_utils.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp b/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp
rename to dpnp/tensor/libtensor/include/utils/memory_overlap.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp b/dpnp/tensor/libtensor/include/utils/offset_utils.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp
rename to dpnp/tensor/libtensor/include/utils/offset_utils.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp b/dpnp/tensor/libtensor/include/utils/output_validation.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp
rename to dpnp/tensor/libtensor/include/utils/output_validation.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp b/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp
rename to dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp b/dpnp/tensor/libtensor/include/utils/strided_iters.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp
rename to dpnp/tensor/libtensor/include/utils/strided_iters.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpnp/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
rename to dpnp/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp b/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp
rename to dpnp/tensor/libtensor/include/utils/sycl_utils.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp b/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp
rename to dpnp/tensor/libtensor/include/utils/type_dispatch.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp
rename to dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp
diff --git a/dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp b/dpnp/tensor/libtensor/include/utils/type_utils.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp
rename to dpnp/tensor/libtensor/include/utils/type_utils.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators.cpp b/dpnp/tensor/libtensor/source/accumulators.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/accumulators.cpp
rename to dpnp/tensor/libtensor/source/accumulators.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators.hpp b/dpnp/tensor/libtensor/source/accumulators.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/accumulators.hpp
rename to dpnp/tensor/libtensor/source/accumulators.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
rename to dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpnp/tensor/libtensor/source/accumulators/accumulators_common.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp
rename to dpnp/tensor/libtensor/source/accumulators/accumulators_common.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp b/dpnp/tensor/libtensor/source/accumulators/accumulators_common.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp
rename to dpnp/tensor/libtensor/source/accumulators/accumulators_common.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
rename to dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp
rename to dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp
rename to dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp
rename to dpnp/tensor/libtensor/source/accumulators/cumulative_prod.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp
rename to dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp
rename to dpnp/tensor/libtensor/source/accumulators/cumulative_sum.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp
rename to dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp
rename to dpnp/tensor/libtensor/source/boolean_advanced_indexing.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/clip.cpp b/dpnp/tensor/libtensor/source/clip.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/clip.cpp
rename to dpnp/tensor/libtensor/source/clip.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/clip.hpp b/dpnp/tensor/libtensor/source/clip.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/clip.hpp
rename to dpnp/tensor/libtensor/source/clip.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
rename to dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
rename to dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpnp/tensor/libtensor/source/copy_as_contig.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp
rename to dpnp/tensor/libtensor/source/copy_as_contig.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp b/dpnp/tensor/libtensor/source/copy_as_contig.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp
rename to dpnp/tensor/libtensor/source/copy_as_contig.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp b/dpnp/tensor/libtensor/source/copy_for_reshape.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp
rename to dpnp/tensor/libtensor/source/copy_for_reshape.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp b/dpnp/tensor/libtensor/source/copy_for_reshape.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp
rename to dpnp/tensor/libtensor/source/copy_for_reshape.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp b/dpnp/tensor/libtensor/source/copy_for_roll.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp
rename to dpnp/tensor/libtensor/source/copy_for_roll.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp b/dpnp/tensor/libtensor/source/copy_for_roll.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp
rename to dpnp/tensor/libtensor/source/copy_for_roll.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
rename to dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
rename to dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpnp/tensor/libtensor/source/device_support_queries.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/device_support_queries.cpp
rename to dpnp/tensor/libtensor/source/device_support_queries.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp b/dpnp/tensor/libtensor/source/device_support_queries.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/device_support_queries.hpp
rename to dpnp/tensor/libtensor/source/device_support_queries.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/abs.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/abs.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/abs.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/abs.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/abs.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/abs.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/abs.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/abs.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/acos.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/acos.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/acos.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/acos.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/acos.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/acos.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/acos.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/acos.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/acosh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/acosh.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/acosh.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/acosh.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/acosh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/acosh.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/acosh.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/acosh.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/add.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/add.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/add.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/add.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/add.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/add.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/angle.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/angle.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/angle.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/angle.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/angle.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/angle.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/angle.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/angle.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/asin.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/asin.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/asin.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/asin.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/asin.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/asin.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/asin.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/asin.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/asinh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/asinh.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/asinh.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/asinh.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/asinh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/asinh.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/asinh.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/asinh.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/atan.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/atan.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/atan.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/atan.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/atan.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/atan.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan2.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/atan2.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/atan2.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/atan2.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/atan2.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/atan2.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/atan2.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/atan2.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/atanh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/atanh.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/atanh.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/atanh.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/atanh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/atanh.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/atanh.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/atanh.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/cbrt.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/cbrt.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/cbrt.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/cbrt.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/cbrt.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/cbrt.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/ceil.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/ceil.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/ceil.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/ceil.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/ceil.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/ceil.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/ceil.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/ceil.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/conj.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/conj.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/conj.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/conj.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/conj.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/conj.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/conj.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/conj.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/copysign.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/copysign.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/copysign.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/copysign.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/cos.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/cos.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/cos.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/cos.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/cos.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/cos.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/cos.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/cos.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/cosh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/cosh.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/cosh.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/cosh.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/cosh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/cosh.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/cosh.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/cosh.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/equal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/equal.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/equal.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/equal.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/equal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/equal.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/equal.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/equal.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/exp.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/exp.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/exp.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/exp.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/exp.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/exp.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp2.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/exp2.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/exp2.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/exp2.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/exp2.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/exp2.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/exp2.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/exp2.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/expm1.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/expm1.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/expm1.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/expm1.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/expm1.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/expm1.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/expm1.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/expm1.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/floor.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/floor.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/floor.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/floor.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/floor.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/floor.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor_divide.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/floor_divide.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/greater.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/greater.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/greater.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater_equal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/greater_equal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/hypot.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/hypot.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/hypot.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/hypot.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/hypot.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/hypot.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/hypot.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/hypot.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/imag.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/imag.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/imag.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/imag.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/imag.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/imag.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/imag.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/imag.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/isfinite.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/isfinite.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/isfinite.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/isfinite.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/isfinite.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/isfinite.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/isinf.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/isinf.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/isinf.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/isinf.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/isinf.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/isinf.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/isinf.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/isinf.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/isnan.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/isnan.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/isnan.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/isnan.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/isnan.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/isnan.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/isnan.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/isnan.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/less.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/less.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/less.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/less.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/less.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/less.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/less.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/less.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/less_equal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/less_equal.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/less_equal.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/less_equal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/less_equal.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/less_equal.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/log.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/log.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/log.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/log.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/log.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/log.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log10.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/log10.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/log10.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/log10.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log10.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/log10.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/log10.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/log10.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log1p.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/log1p.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/log1p.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/log1p.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log1p.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/log1p.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/log1p.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/log1p.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log2.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/log2.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/log2.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/log2.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/log2.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/log2.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/log2.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/log2.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logaddexp.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logaddexp.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/logaddexp.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/logical_and.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_not.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_not.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_not.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/logical_not.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/logical_or.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/logical_or.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/maximum.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/maximum.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/maximum.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/maximum.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/minimum.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/minimum.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/minimum.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/minimum.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/multiply.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/multiply.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/multiply.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/multiply.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/negative.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/negative.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/negative.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/negative.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/negative.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/negative.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/negative.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/negative.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/nextafter.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/nextafter.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/not_equal.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/not_equal.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/positive.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/positive.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/positive.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/positive.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/positive.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/positive.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/positive.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/positive.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/pow.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/pow.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/pow.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/pow.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/proj.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/proj.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/proj.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/proj.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/proj.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/proj.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/proj.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/proj.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/real.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/real.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/real.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/real.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/real.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/real.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/real.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/real.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/reciprocal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/reciprocal.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/reciprocal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/reciprocal.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/remainder.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/round.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/round.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/round.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/round.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/round.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/round.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/round.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/round.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/rsqrt.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/rsqrt.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/rsqrt.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/rsqrt.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sign.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/sign.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/sign.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/sign.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sign.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/sign.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/sign.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/sign.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/signbit.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/signbit.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/signbit.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/signbit.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/signbit.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/signbit.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/signbit.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/signbit.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sin.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/sin.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/sin.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/sin.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sin.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/sin.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/sin.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/sin.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sinh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/sinh.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/sinh.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/sinh.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sinh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/sinh.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/sinh.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/sinh.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sqrt.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/sqrt.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/sqrt.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/sqrt.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/sqrt.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/sqrt.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/square.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/square.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/square.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/square.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/square.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/square.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/square.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/square.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/subtract.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/subtract.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/subtract.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/subtract.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/tan.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/tan.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/tan.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/tan.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/tan.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/tan.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/tan.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/tan.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/tanh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/tanh.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/tanh.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/tanh.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/tanh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/tanh.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/tanh.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/tanh.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/true_divide.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/true_divide.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/true_divide.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/true_divide.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/true_divide.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/trunc.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/trunc.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/trunc.cpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/trunc.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/trunc.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/trunc.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/elementwise_functions/trunc.hpp
rename to dpnp/tensor/libtensor/source/elementwise_functions/trunc.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/eye_ctor.cpp b/dpnp/tensor/libtensor/source/eye_ctor.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/eye_ctor.cpp
rename to dpnp/tensor/libtensor/source/eye_ctor.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/eye_ctor.hpp b/dpnp/tensor/libtensor/source/eye_ctor.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/eye_ctor.hpp
rename to dpnp/tensor/libtensor/source/eye_ctor.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpnp/tensor/libtensor/source/full_ctor.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/full_ctor.cpp
rename to dpnp/tensor/libtensor/source/full_ctor.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpnp/tensor/libtensor/source/full_ctor.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/full_ctor.hpp
rename to dpnp/tensor/libtensor/source/full_ctor.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpnp/tensor/libtensor/source/integer_advanced_indexing.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp
rename to dpnp/tensor/libtensor/source/integer_advanced_indexing.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpnp/tensor/libtensor/source/integer_advanced_indexing.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp
rename to dpnp/tensor/libtensor/source/integer_advanced_indexing.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/linalg_functions/dot.cpp b/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/linalg_functions/dot.cpp
rename to dpnp/tensor/libtensor/source/linalg_functions/dot.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/linalg_functions/dot.hpp b/dpnp/tensor/libtensor/source/linalg_functions/dot.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/linalg_functions/dot.hpp
rename to dpnp/tensor/libtensor/source/linalg_functions/dot.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp b/dpnp/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
rename to dpnp/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp b/dpnp/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
rename to dpnp/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpnp/tensor/libtensor/source/linear_sequences.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/linear_sequences.cpp
rename to dpnp/tensor/libtensor/source/linear_sequences.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpnp/tensor/libtensor/source/linear_sequences.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/linear_sequences.hpp
rename to dpnp/tensor/libtensor/source/linear_sequences.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/all.cpp b/dpnp/tensor/libtensor/source/reductions/all.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/all.cpp
rename to dpnp/tensor/libtensor/source/reductions/all.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/all.hpp b/dpnp/tensor/libtensor/source/reductions/all.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/all.hpp
rename to dpnp/tensor/libtensor/source/reductions/all.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/any.cpp b/dpnp/tensor/libtensor/source/reductions/any.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/any.cpp
rename to dpnp/tensor/libtensor/source/reductions/any.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/any.hpp b/dpnp/tensor/libtensor/source/reductions/any.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/any.hpp
rename to dpnp/tensor/libtensor/source/reductions/any.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp b/dpnp/tensor/libtensor/source/reductions/argmax.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp
rename to dpnp/tensor/libtensor/source/reductions/argmax.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp b/dpnp/tensor/libtensor/source/reductions/argmax.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp
rename to dpnp/tensor/libtensor/source/reductions/argmax.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp b/dpnp/tensor/libtensor/source/reductions/argmin.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp
rename to dpnp/tensor/libtensor/source/reductions/argmin.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp b/dpnp/tensor/libtensor/source/reductions/argmin.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp
rename to dpnp/tensor/libtensor/source/reductions/argmin.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp b/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp
rename to dpnp/tensor/libtensor/source/reductions/logsumexp.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp b/dpnp/tensor/libtensor/source/reductions/logsumexp.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp
rename to dpnp/tensor/libtensor/source/reductions/logsumexp.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/max.cpp b/dpnp/tensor/libtensor/source/reductions/max.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/max.cpp
rename to dpnp/tensor/libtensor/source/reductions/max.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/max.hpp b/dpnp/tensor/libtensor/source/reductions/max.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/max.hpp
rename to dpnp/tensor/libtensor/source/reductions/max.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/min.cpp b/dpnp/tensor/libtensor/source/reductions/min.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/min.cpp
rename to dpnp/tensor/libtensor/source/reductions/min.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/min.hpp b/dpnp/tensor/libtensor/source/reductions/min.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/min.hpp
rename to dpnp/tensor/libtensor/source/reductions/min.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp b/dpnp/tensor/libtensor/source/reductions/prod.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/prod.cpp
rename to dpnp/tensor/libtensor/source/reductions/prod.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/prod.hpp b/dpnp/tensor/libtensor/source/reductions/prod.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/prod.hpp
rename to dpnp/tensor/libtensor/source/reductions/prod.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp
rename to dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp
rename to dpnp/tensor/libtensor/source/reductions/reduce_hypot.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
rename to dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpnp/tensor/libtensor/source/reductions/reduction_common.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp
rename to dpnp/tensor/libtensor/source/reductions/reduction_common.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_common.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp
rename to dpnp/tensor/libtensor/source/reductions/reduction_common.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp
rename to dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp b/dpnp/tensor/libtensor/source/reductions/sum.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/sum.cpp
rename to dpnp/tensor/libtensor/source/reductions/sum.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/reductions/sum.hpp b/dpnp/tensor/libtensor/source/reductions/sum.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/reductions/sum.hpp
rename to dpnp/tensor/libtensor/source/reductions/sum.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/repeat.cpp b/dpnp/tensor/libtensor/source/repeat.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/repeat.cpp
rename to dpnp/tensor/libtensor/source/repeat.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/repeat.hpp b/dpnp/tensor/libtensor/source/repeat.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/repeat.hpp
rename to dpnp/tensor/libtensor/source/repeat.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp
rename to dpnp/tensor/libtensor/source/simplify_iteration_space.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp b/dpnp/tensor/libtensor/source/simplify_iteration_space.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp
rename to dpnp/tensor/libtensor/source/simplify_iteration_space.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp b/dpnp/tensor/libtensor/source/sorting/isin.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/isin.cpp
rename to dpnp/tensor/libtensor/source/sorting/isin.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/isin.hpp b/dpnp/tensor/libtensor/source/sorting/isin.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/isin.hpp
rename to dpnp/tensor/libtensor/source/sorting/isin.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp b/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp
rename to dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp b/dpnp/tensor/libtensor/source/sorting/merge_argsort.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp
rename to dpnp/tensor/libtensor/source/sorting/merge_argsort.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp b/dpnp/tensor/libtensor/source/sorting/merge_sort.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp
rename to dpnp/tensor/libtensor/source/sorting/merge_sort.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp b/dpnp/tensor/libtensor/source/sorting/merge_sort.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp
rename to dpnp/tensor/libtensor/source/sorting/merge_sort.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp b/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp
rename to dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp b/dpnp/tensor/libtensor/source/sorting/py_sort_common.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp
rename to dpnp/tensor/libtensor/source/sorting/py_sort_common.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp b/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp
rename to dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp b/dpnp/tensor/libtensor/source/sorting/radix_argsort.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp
rename to dpnp/tensor/libtensor/source/sorting/radix_argsort.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp b/dpnp/tensor/libtensor/source/sorting/radix_sort.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp
rename to dpnp/tensor/libtensor/source/sorting/radix_sort.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp b/dpnp/tensor/libtensor/source/sorting/radix_sort.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp
rename to dpnp/tensor/libtensor/source/sorting/radix_sort.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp b/dpnp/tensor/libtensor/source/sorting/radix_sort_support.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp
rename to dpnp/tensor/libtensor/source/sorting/radix_sort_support.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp b/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp
rename to dpnp/tensor/libtensor/source/sorting/searchsorted.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp b/dpnp/tensor/libtensor/source/sorting/searchsorted.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp
rename to dpnp/tensor/libtensor/source/sorting/searchsorted.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp b/dpnp/tensor/libtensor/source/sorting/topk.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/topk.cpp
rename to dpnp/tensor/libtensor/source/sorting/topk.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/sorting/topk.hpp b/dpnp/tensor/libtensor/source/sorting/topk.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/sorting/topk.hpp
rename to dpnp/tensor/libtensor/source/sorting/topk.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp b/dpnp/tensor/libtensor/source/tensor_accumulation.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp
rename to dpnp/tensor/libtensor/source/tensor_accumulation.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpnp/tensor/libtensor/source/tensor_ctors.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp
rename to dpnp/tensor/libtensor/source/tensor_ctors.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_elementwise.cpp b/dpnp/tensor/libtensor/source/tensor_elementwise.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/tensor_elementwise.cpp
rename to dpnp/tensor/libtensor/source/tensor_elementwise.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_linalg.cpp b/dpnp/tensor/libtensor/source/tensor_linalg.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/tensor_linalg.cpp
rename to dpnp/tensor/libtensor/source/tensor_linalg.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp b/dpnp/tensor/libtensor/source/tensor_reductions.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp
rename to dpnp/tensor/libtensor/source/tensor_reductions.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpnp/tensor/libtensor/source/tensor_sorting.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp
rename to dpnp/tensor/libtensor/source/tensor_sorting.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpnp/tensor/libtensor/source/triul_ctor.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/triul_ctor.cpp
rename to dpnp/tensor/libtensor/source/triul_ctor.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpnp/tensor/libtensor/source/triul_ctor.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/triul_ctor.hpp
rename to dpnp/tensor/libtensor/source/triul_ctor.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/where.cpp b/dpnp/tensor/libtensor/source/where.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/where.cpp
rename to dpnp/tensor/libtensor/source/where.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/where.hpp b/dpnp/tensor/libtensor/source/where.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/where.hpp
rename to dpnp/tensor/libtensor/source/where.hpp
diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpnp/tensor/libtensor/source/zeros_ctor.cpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp
rename to dpnp/tensor/libtensor/source/zeros_ctor.cpp
diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpnp/tensor/libtensor/source/zeros_ctor.hpp
similarity index 100%
rename from dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp
rename to dpnp/tensor/libtensor/source/zeros_ctor.hpp
diff --git a/dpnp/tests/test_array_api_info.py b/dpnp/tests/test_array_api_info.py
index 32730c8724dc..81141f9dab00 100644
--- a/dpnp/tests/test_array_api_info.py
+++ b/dpnp/tests/test_array_api_info.py
@@ -2,10 +2,7 @@
 from dpctl import SyclDeviceCreationError, get_devices, select_default_device
 
 import dpnp
-
-# TODO: revert to `from dpctl.tensor....`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._tensor_impl import default_device_complex_type
+from dpnp.tensor._tensor_impl import default_device_complex_type
 from dpnp.tests.helper import (
     has_support_aspect64,
     is_win_platform,
diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py
index 8d89f2a42ca8..b195c0484105 100644
--- a/dpnp/tests/test_arraycreation.py
+++ b/dpnp/tests/test_arraycreation.py
@@ -12,10 +12,8 @@
     assert_raises_regex,
 )
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_arraymanipulation.py b/dpnp/tests/test_arraymanipulation.py
index f7df6387caf6..970e0b12318d 100644
--- a/dpnp/tests/test_arraymanipulation.py
+++ b/dpnp/tests/test_arraymanipulation.py
@@ -2,14 +2,9 @@
 import pytest
 from numpy.testing import assert_array_equal, assert_equal, assert_raises
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+import dpnp.tensor as dpt
+from dpnp.tensor._numpy_helper import AxisError
 
 from .helper import get_all_dtypes, get_float_complex_dtypes
 from .third_party.cupy import testing
diff --git a/dpnp/tests/test_counting.py b/dpnp/tests/test_counting.py
index 9210e7c1b3dd..e729b98af756 100644
--- a/dpnp/tests/test_counting.py
+++ b/dpnp/tests/test_counting.py
@@ -7,10 +7,7 @@
 )
 
 import dpnp
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+from dpnp.tensor._numpy_helper import AxisError
 
 from .helper import (
     get_all_dtypes,
diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py
index 3a19a2cf3668..c39303f32c1a 100644
--- a/dpnp/tests/test_fft.py
+++ b/dpnp/tests/test_fft.py
@@ -4,10 +4,8 @@
 from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_raises
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 from dpnp.dpnp_utils import map_dtype_to_device
 
 from .helper import (
diff --git a/dpnp/tests/test_flipping.py b/dpnp/tests/test_flipping.py
index cd55846e3668..5bb3cd85bf79 100644
--- a/dpnp/tests/test_flipping.py
+++ b/dpnp/tests/test_flipping.py
@@ -7,10 +7,7 @@
 )
 
 import dpnp
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+from dpnp.tensor._numpy_helper import AxisError
 
 from .helper import (
     get_all_dtypes,
diff --git a/dpnp/tests/test_indexing.py b/dpnp/tests/test_indexing.py
index d8822d77080b..3bdd5449d223 100644
--- a/dpnp/tests/test_indexing.py
+++ b/dpnp/tests/test_indexing.py
@@ -12,13 +12,11 @@
     assert_raises_regex,
 )
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
-from dpctl_ext.tensor._numpy_helper import AxisError
-from dpctl_ext.tensor._type_utils import _to_device_supported_dtype
+import dpnp.tensor as dpt
 from dpnp.dpnp_array import dpnp_array
+from dpnp.tensor._numpy_helper import AxisError
+from dpnp.tensor._type_utils import _to_device_supported_dtype
 
 from .helper import (
     get_abs_array,
diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index dfd6e21c2a95..d32237c04aad 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -12,14 +12,9 @@
     assert_raises_regex,
 )
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+import dpnp.tensor as dpt
+from dpnp.tensor._numpy_helper import AxisError
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_manipulation.py b/dpnp/tests/test_manipulation.py
index d30c08a65f1e..672fce699f14 100644
--- a/dpnp/tests/test_manipulation.py
+++ b/dpnp/tests/test_manipulation.py
@@ -8,14 +8,9 @@
     assert_raises,
 )
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+import dpnp.tensor as dpt
+from dpnp.tensor._numpy_helper import AxisError
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py
index 155f4cdb06fb..ae273ffa8c03 100644
--- a/dpnp/tests/test_mathematical.py
+++ b/dpnp/tests/test_mathematical.py
@@ -10,19 +10,14 @@
     assert_raises_regex,
 )
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import (
+import dpnp.tensor as dpt
+from dpnp.dpnp_array import dpnp_array
+from dpnp.dpnp_utils import map_dtype_to_device
+from dpnp.tensor._numpy_helper import (
     AxisError,
     normalize_axis_index,
 )
-from dpnp.dpnp_array import dpnp_array
-from dpnp.dpnp_utils import map_dtype_to_device
 
 from .helper import (
     LTS_VERSION,
diff --git a/dpnp/tests/test_memory.py b/dpnp/tests/test_memory.py
index dd87a993e1dc..c2601ab092f7 100644
--- a/dpnp/tests/test_memory.py
+++ b/dpnp/tests/test_memory.py
@@ -1,11 +1,9 @@
 import numpy
 import pytest
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
 import dpnp.memory as dpm
+import dpnp.tensor as dpt
 
 
 class IntUsmData(dpt.usm_ndarray):
diff --git a/dpnp/tests/test_nanfunctions.py b/dpnp/tests/test_nanfunctions.py
index 2cb70df5954a..3c5ea5b61989 100644
--- a/dpnp/tests/test_nanfunctions.py
+++ b/dpnp/tests/test_nanfunctions.py
@@ -11,10 +11,8 @@
     assert_raises_regex,
 )
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_ndarray.py b/dpnp/tests/test_ndarray.py
index 8944043d90a0..a16ccd9f5bf3 100644
--- a/dpnp/tests/test_ndarray.py
+++ b/dpnp/tests/test_ndarray.py
@@ -8,10 +8,8 @@
     assert_raises_regex,
 )
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 
 from .helper import (
     generate_random_numpy_array,
diff --git a/dpnp/tests/test_product.py b/dpnp/tests/test_product.py
index 9c2bc54e30b5..cd71b07352da 100644
--- a/dpnp/tests/test_product.py
+++ b/dpnp/tests/test_product.py
@@ -5,11 +5,8 @@
 from numpy.testing import assert_allclose, assert_array_equal, assert_raises
 
 import dpnp
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
 from dpnp.dpnp_utils import map_dtype_to_device
+from dpnp.tensor._numpy_helper import AxisError
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_search.py b/dpnp/tests/test_search.py
index 36e0032ccff1..75ce9bdeed20 100644
--- a/dpnp/tests/test_search.py
+++ b/dpnp/tests/test_search.py
@@ -2,10 +2,8 @@
 import pytest
 from numpy.testing import assert_array_equal, assert_equal, assert_raises
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 
 from .helper import (
     generate_random_numpy_array,
diff --git a/dpnp/tests/test_sort.py b/dpnp/tests/test_sort.py
index 73eac4064892..9d893a858fc1 100644
--- a/dpnp/tests/test_sort.py
+++ b/dpnp/tests/test_sort.py
@@ -3,10 +3,7 @@
 from numpy.testing import assert_array_equal, assert_equal, assert_raises
 
 import dpnp
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+from dpnp.tensor._numpy_helper import AxisError
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_statistics.py b/dpnp/tests/test_statistics.py
index fe8848b6c858..a02adfac2ecb 100644
--- a/dpnp/tests/test_statistics.py
+++ b/dpnp/tests/test_statistics.py
@@ -8,10 +8,8 @@
     assert_raises_regex,
 )
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index 4485d79b2213..f1678bd28da3 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -7,11 +7,9 @@
 from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_array_equal, assert_raises
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
 import dpnp.linalg
+import dpnp.tensor as dpt
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations
 
diff --git a/dpnp/tests/test_usm_type.py b/dpnp/tests/test_usm_type.py
index 8f8efd1cdd10..a0cfe6d24979 100644
--- a/dpnp/tests/test_usm_type.py
+++ b/dpnp/tests/test_usm_type.py
@@ -6,10 +6,8 @@
 import numpy
 import pytest
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 from dpnp.dpnp_utils import get_usm_allocations
 
 from .helper import generate_random_numpy_array
diff --git a/dpnp/tests/test_utils.py b/dpnp/tests/test_utils.py
index ddbd267c2108..aef6abba8726 100644
--- a/dpnp/tests/test_utils.py
+++ b/dpnp/tests/test_utils.py
@@ -1,10 +1,8 @@
 import numpy
 import pytest
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt
 import dpnp
+import dpnp.tensor as dpt
 
 
 class TestIsSupportedArrayOrScalar:
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
index e44f51f09b20..eb9e958fad0b 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
@@ -4,10 +4,8 @@
 import numpy
 import pytest
 
-# TODO: revert to `import dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor._dlpack as dlp
 import dpnp as cupy
+import dpnp.tensor._dlpack as dlp
 from dpnp.tests.third_party.cupy import testing
 
 
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py b/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
index 085261317ead..2cff78627368 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
@@ -11,10 +11,7 @@
 # from cupy_backends.cuda.api import runtime
 # from cupy_backends.cuda import stream as stream_module
 import dpnp as cupy
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+from dpnp.tensor._numpy_helper import AxisError
 
 # from cupy import _util
 # from cupy import _core
diff --git a/dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py b/dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py
index a1309f3ed83d..51fe5f1ca68e 100644
--- a/dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py
+++ b/dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py
@@ -4,10 +4,7 @@
 import pytest
 
 import dpnp as cupy
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+from dpnp.tensor._numpy_helper import AxisError
 from dpnp.tests.helper import has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
 
diff --git a/dpnp/tests/third_party/cupy/manipulation_tests/test_dims.py b/dpnp/tests/third_party/cupy/manipulation_tests/test_dims.py
index 8944a6b944c9..7391ed9b0f88 100644
--- a/dpnp/tests/third_party/cupy/manipulation_tests/test_dims.py
+++ b/dpnp/tests/third_party/cupy/manipulation_tests/test_dims.py
@@ -4,10 +4,7 @@
 import pytest
 
 import dpnp as cupy
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+from dpnp.tensor._numpy_helper import AxisError
 from dpnp.tests.third_party.cupy import testing
 
 
diff --git a/dpnp/tests/third_party/cupy/manipulation_tests/test_transpose.py b/dpnp/tests/third_party/cupy/manipulation_tests/test_transpose.py
index 0f6bed1c2ced..145b84027201 100644
--- a/dpnp/tests/third_party/cupy/manipulation_tests/test_transpose.py
+++ b/dpnp/tests/third_party/cupy/manipulation_tests/test_transpose.py
@@ -4,10 +4,7 @@
 import pytest
 
 import dpnp as cupy
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+from dpnp.tensor._numpy_helper import AxisError
 from dpnp.tests.third_party.cupy import testing
 
 
diff --git a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
index cb7200c1b13b..acd5d882f61f 100644
--- a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
+++ b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
@@ -4,10 +4,7 @@
 import pytest
 
 import dpnp as cupy
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+from dpnp.tensor._numpy_helper import AxisError
 from dpnp.tests.helper import (
     has_support_aspect16,
     has_support_aspect64,
diff --git a/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py b/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
index 8359ba580a25..209b264c0fc5 100644
--- a/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
+++ b/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
@@ -6,10 +6,7 @@
 import pytest
 
 import dpnp as cupy
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+from dpnp.tensor._numpy_helper import AxisError
 from dpnp.tests.helper import has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
 
diff --git a/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py b/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py
index d355d18985f2..70b1392a37af 100644
--- a/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py
+++ b/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py
@@ -4,10 +4,7 @@
 import pytest
 
 import dpnp as cupy
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+from dpnp.tensor._numpy_helper import AxisError
 from dpnp.tests.helper import has_support_aspect16, has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
 
diff --git a/dpnp/tests/third_party/cupy/testing/_loops.py b/dpnp/tests/third_party/cupy/testing/_loops.py
index 66c243a3d7f7..bd1d178e1a66 100644
--- a/dpnp/tests/third_party/cupy/testing/_loops.py
+++ b/dpnp/tests/third_party/cupy/testing/_loops.py
@@ -12,10 +12,7 @@
 from dpctl import select_default_device
 
 import dpnp as cupy
-
-# TODO: revert to `from dpctl.tensor...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl_ext.tensor._numpy_helper import AxisError
+from dpnp.tensor._numpy_helper import AxisError
 from dpnp.tests import config
 from dpnp.tests.third_party.cupy.testing import _array, _parameterized
 from dpnp.tests.third_party.cupy.testing._pytest_impl import is_available
diff --git a/setup.py b/setup.py
index 7ffef3bed9d8..86899c27ca65 100644
--- a/setup.py
+++ b/setup.py
@@ -34,6 +34,7 @@
     cmdclass=versioneer.get_cmdclass(),
     packages=[
         "dpnp",
+        "dpnp.tensor",
         "dpnp.dpnp_algo",
         "dpnp.dpnp_utils",
         "dpnp.exceptions",
@@ -44,9 +45,6 @@
         "dpnp.scipy",
         "dpnp.scipy.linalg",
         "dpnp.scipy.special",
-        # TODO: replace with dpctl; dpctl.tensor
-        "dpctl_ext",
-        "dpctl_ext.tensor",
     ],
     package_data={
         "dpnp": [

From e1aad07e406bbc4f339eee82e95de185a1d83193 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 10 Apr 2026 13:41:58 +0200
Subject: [PATCH 24/43] Move tensor tests from dpctl  (#2817)

This PR moves all tensor-related tests to `dpnp/tests/tensor` as part of
the ongoing migration of tensor functionality from `dpctl` to
`dpnp.tensor`

Key changes:

> - Relocated 89 tensor tests (elementwise functions, `usm_ndarray`, and
tensor utilities)
> - Updated imports to use `dpnp.tensor`
> - Included tests in packaging configuration
> - Integrated tensor tests into CI
> - Fixed several issues discovered during migration (dtype
expectations, boolean reductions, etc.)
> - Fixed a circular import in _usmarray.py
> - Added `SKIP_TENSOR_TESTS` env variable to manage the launch of the
test scope

In a follow-up PR:

> - Conditional logic will be added to run dpctl_ext/tests only when
changes affect the tensor code.
> - Array API tests for tensor will be introduced and executed as a
separate CI job.
---
 .github/workflows/check-onemath.yaml          |   54 +-
 .github/workflows/conda-package.yml           |   44 +-
 .github/workflows/generate_coverage.yaml      |    1 +
 dpnp/tensor/CMakeLists.txt                    |   12 +
 dpnp/tensor/_usmarray.pyx                     |    2 +-
 .../kernels/elementwise_functions/round.hpp   |    1 +
 .../libtensor/include/utils/sycl_utils.hpp    |   18 +-
 dpnp/tests/config.py                          |    1 +
 dpnp/tests/conftest.py                        |    4 +
 dpnp/tests/tensor/__init__.py                 |   31 +
 dpnp/tests/tensor/conftest.py                 |   31 +
 dpnp/tests/tensor/elementwise/__init__.py     |   32 +
 dpnp/tests/tensor/elementwise/test_abs.py     |  224 ++
 dpnp/tests/tensor/elementwise/test_add.py     |  593 +++++
 dpnp/tests/tensor/elementwise/test_angle.py   |  111 +
 dpnp/tests/tensor/elementwise/test_atan2.py   |  524 ++++
 .../tensor/elementwise/test_bitwise_and.py    |  142 +
 .../tensor/elementwise/test_bitwise_invert.py |  148 ++
 .../elementwise/test_bitwise_left_shift.py    |  150 ++
 .../tensor/elementwise/test_bitwise_or.py     |  158 ++
 .../elementwise/test_bitwise_right_shift.py   |  166 ++
 .../tensor/elementwise/test_bitwise_xor.py    |  158 ++
 dpnp/tests/tensor/elementwise/test_cbrt.py    |   98 +
 dpnp/tests/tensor/elementwise/test_complex.py |  243 ++
 .../tests/tensor/elementwise/test_copysign.py |  130 +
 dpnp/tests/tensor/elementwise/test_divide.py  |  313 +++
 .../elementwise/test_elementwise_classes.py   |  150 ++
 dpnp/tests/tensor/elementwise/test_equal.py   |  209 ++
 dpnp/tests/tensor/elementwise/test_exp.py     |  253 ++
 dpnp/tests/tensor/elementwise/test_exp2.py    |  187 ++
 dpnp/tests/tensor/elementwise/test_expm1.py   |  187 ++
 .../elementwise/test_floor_ceil_trunc.py      |  182 ++
 .../tensor/elementwise/test_floor_divide.py   |  319 +++
 dpnp/tests/tensor/elementwise/test_greater.py |  316 +++
 .../tensor/elementwise/test_greater_equal.py  |  315 +++
 .../tensor/elementwise/test_hyperbolic.py     |  202 ++
 dpnp/tests/tensor/elementwise/test_hypot.py   |  212 ++
 .../tests/tensor/elementwise/test_isfinite.py |  114 +
 dpnp/tests/tensor/elementwise/test_isinf.py   |  108 +
 dpnp/tests/tensor/elementwise/test_isnan.py   |  113 +
 dpnp/tests/tensor/elementwise/test_less.py    |  316 +++
 .../tensor/elementwise/test_less_equal.py     |  315 +++
 dpnp/tests/tensor/elementwise/test_log.py     |  149 ++
 dpnp/tests/tensor/elementwise/test_log10.py   |  152 ++
 dpnp/tests/tensor/elementwise/test_log1p.py   |  188 ++
 dpnp/tests/tensor/elementwise/test_log2.py    |  148 ++
 .../tensor/elementwise/test_logaddexp.py      |  213 ++
 .../tensor/elementwise/test_logical_and.py    |  323 +++
 .../tensor/elementwise/test_logical_not.py    |  198 ++
 .../tensor/elementwise/test_logical_or.py     |  324 +++
 .../tensor/elementwise/test_logical_xor.py    |  325 +++
 .../elementwise/test_maximum_minimum.py       |  333 +++
 .../tests/tensor/elementwise/test_multiply.py |  253 ++
 .../tests/tensor/elementwise/test_negative.py |  101 +
 .../tensor/elementwise/test_nextafter.py      |  169 ++
 .../tensor/elementwise/test_not_equal.py      |  227 ++
 .../tests/tensor/elementwise/test_positive.py |   94 +
 dpnp/tests/tensor/elementwise/test_pow.py     |  231 ++
 .../tensor/elementwise/test_reciprocal.py     |  108 +
 .../tensor/elementwise/test_remainder.py      |  279 ++
 dpnp/tests/tensor/elementwise/test_round.py   |  234 ++
 dpnp/tests/tensor/elementwise/test_rsqrt.py   |   93 +
 dpnp/tests/tensor/elementwise/test_sign.py    |  140 +
 dpnp/tests/tensor/elementwise/test_signbit.py |  124 +
 dpnp/tests/tensor/elementwise/test_sqrt.py    |  207 ++
 dpnp/tests/tensor/elementwise/test_square.py  |  114 +
 .../tests/tensor/elementwise/test_subtract.py |  254 ++
 .../tensor/elementwise/test_trigonometric.py  |  234 ++
 .../tensor/elementwise/test_type_utils.py     |  254 ++
 dpnp/tests/tensor/elementwise/utils.py        |   74 +
 dpnp/tests/tensor/helper/__init__.py          |   47 +
 dpnp/tests/tensor/helper/_helper.py           |   89 +
 dpnp/tests/tensor/test_tensor_accumulation.py |  450 ++++
 .../test_tensor_array_api_inspection.py       |  238 ++
 dpnp/tests/tensor/test_tensor_asarray.py      |  664 +++++
 dpnp/tests/tensor/test_tensor_clip.py         |  793 ++++++
 dpnp/tests/tensor/test_tensor_copy_utils.py   |  113 +
 dpnp/tests/tensor/test_tensor_diff.py         |  345 +++
 .../tensor/test_tensor_dtype_routines.py      |  170 ++
 dpnp/tests/tensor/test_tensor_isin.py         |  282 ++
 .../test_tensor_statistical_functions.py      |  271 ++
 dpnp/tests/tensor/test_tensor_sum.py          |  348 +++
 dpnp/tests/tensor/test_tensor_testing.py      |  181 ++
 dpnp/tests/tensor/test_usm_ndarray_ctor.py    | 2324 +++++++++++++++++
 dpnp/tests/tensor/test_usm_ndarray_dlpack.py  |  917 +++++++
 .../tests/tensor/test_usm_ndarray_indexing.py | 2055 +++++++++++++++
 dpnp/tests/tensor/test_usm_ndarray_linalg.py  | 1031 ++++++++
 .../tensor/test_usm_ndarray_manipulation.py   | 1609 ++++++++++++
 .../tensor/test_usm_ndarray_operators.py      |  154 ++
 dpnp/tests/tensor/test_usm_ndarray_print.py   |  408 +++
 .../tensor/test_usm_ndarray_reductions.py     |  706 +++++
 .../test_usm_ndarray_search_functions.py      |  594 +++++
 .../tensor/test_usm_ndarray_searchsorted.py   |  408 +++
 dpnp/tests/tensor/test_usm_ndarray_sorting.py |  397 +++
 dpnp/tests/tensor/test_usm_ndarray_top_k.py   |  331 +++
 dpnp/tests/tensor/test_usm_ndarray_unique.py  |  361 +++
 .../test_usm_ndarray_utility_functions.py     |  199 ++
 dpnp/tests/test_ndarray.py                    |    3 +
 pyproject.toml                                |    2 +-
 setup.py                                      |    4 +-
 100 files changed, 27916 insertions(+), 8 deletions(-)
 create mode 100644 dpnp/tests/tensor/__init__.py
 create mode 100644 dpnp/tests/tensor/conftest.py
 create mode 100644 dpnp/tests/tensor/elementwise/__init__.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_abs.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_add.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_angle.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_atan2.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_bitwise_and.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_bitwise_invert.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_bitwise_left_shift.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_bitwise_or.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_bitwise_right_shift.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_bitwise_xor.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_cbrt.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_complex.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_copysign.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_divide.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_elementwise_classes.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_equal.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_exp.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_exp2.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_expm1.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_floor_ceil_trunc.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_floor_divide.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_greater.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_greater_equal.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_hyperbolic.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_hypot.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_isfinite.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_isinf.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_isnan.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_less.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_less_equal.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_log.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_log10.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_log1p.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_log2.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_logaddexp.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_logical_and.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_logical_not.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_logical_or.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_logical_xor.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_maximum_minimum.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_multiply.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_negative.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_nextafter.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_not_equal.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_positive.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_pow.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_reciprocal.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_remainder.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_round.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_rsqrt.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_sign.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_signbit.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_sqrt.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_square.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_subtract.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_trigonometric.py
 create mode 100644 dpnp/tests/tensor/elementwise/test_type_utils.py
 create mode 100644 dpnp/tests/tensor/elementwise/utils.py
 create mode 100644 dpnp/tests/tensor/helper/__init__.py
 create mode 100644 dpnp/tests/tensor/helper/_helper.py
 create mode 100644 dpnp/tests/tensor/test_tensor_accumulation.py
 create mode 100644 dpnp/tests/tensor/test_tensor_array_api_inspection.py
 create mode 100644 dpnp/tests/tensor/test_tensor_asarray.py
 create mode 100644 dpnp/tests/tensor/test_tensor_clip.py
 create mode 100644 dpnp/tests/tensor/test_tensor_copy_utils.py
 create mode 100644 dpnp/tests/tensor/test_tensor_diff.py
 create mode 100644 dpnp/tests/tensor/test_tensor_dtype_routines.py
 create mode 100644 dpnp/tests/tensor/test_tensor_isin.py
 create mode 100644 dpnp/tests/tensor/test_tensor_statistical_functions.py
 create mode 100644 dpnp/tests/tensor/test_tensor_sum.py
 create mode 100644 dpnp/tests/tensor/test_tensor_testing.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_ctor.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_dlpack.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_indexing.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_linalg.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_manipulation.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_operators.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_print.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_reductions.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_search_functions.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_searchsorted.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_sorting.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_top_k.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_unique.py
 create mode 100644 dpnp/tests/tensor/test_usm_ndarray_utility_functions.py

diff --git a/.github/workflows/check-onemath.yaml b/.github/workflows/check-onemath.yaml
index 7b18cdfaba64..8fc554c73118 100644
--- a/.github/workflows/check-onemath.yaml
+++ b/.github/workflows/check-onemath.yaml
@@ -74,7 +74,7 @@ jobs:
         os: [ubuntu-22.04] # windows-2022 - no DFT support for Windows in oneMKL
 
     runs-on: ${{ matrix.os }}
-    timeout-minutes: 60
+    timeout-minutes: 120
 
     defaults:
       run:
@@ -133,6 +133,14 @@ jobs:
         if: env.rerun-tests-on-failure != 'true'
         run: |
           python -m pytest -ra --pyargs dpnp.tests
+        env:
+          SKIP_TENSOR_TESTS: 1
+          SYCL_CACHE_PERSISTENT: 1
+
+      - name: Run tensor tests
+        if: env.rerun-tests-on-failure != 'true'
+        run: |
+          python -m pytest -ra --pyargs dpnp.tests.tensor
         env:
           SYCL_CACHE_PERSISTENT: 1
 
@@ -150,6 +158,24 @@ jobs:
             mamba activate ${{ env.test-env-name }}
 
             python -m pytest -ra --pyargs dpnp.tests
+        env:
+          SKIP_TENSOR_TESTS: 1
+          SYCL_CACHE_PERSISTENT: 1
+
+      - name: ReRun tensor tests on Linux
+        if: env.rerun-tests-on-failure == 'true'
+        id: run_tensor_tests
+        uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
+        with:
+          timeout_minutes: ${{ env.rerun-tests-timeout }}
+          max_attempts: ${{ env.rerun-tests-max-attempts }}
+          retry_on: any
+          command: |
+            . $CONDA/etc/profile.d/conda.sh
+            . $CONDA/etc/profile.d/mamba.sh
+            mamba activate ${{ env.test-env-name }}
+
+            python -m pytest -ra --pyargs dpnp.tests.tensor
         env:
           SYCL_CACHE_PERSISTENT: 1
 
@@ -239,6 +265,14 @@ jobs:
         if: env.rerun-tests-on-failure != 'true'
         run: |
           python -m pytest -ra --pyargs dpnp.tests
+        env:
+          SKIP_TENSOR_TESTS: 1
+          SYCL_CACHE_PERSISTENT: 1
+
+      - name: Run tensor tests
+        if: env.rerun-tests-on-failure != 'true'
+        run: |
+          python -m pytest -ra --pyargs dpnp.tests.tensor
         env:
           SYCL_CACHE_PERSISTENT: 1
 
@@ -256,5 +290,23 @@ jobs:
             mamba activate ${{ env.test-env-name }}
 
             python -m pytest -ra --pyargs dpnp.tests
+        env:
+          SKIP_TENSOR_TESTS: 1
+          SYCL_CACHE_PERSISTENT: 1
+
+      - name: ReRun tensor tests on Linux
+        if: env.rerun-tests-on-failure == 'true'
+        id: run_tensor_tests_branch
+        uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
+        with:
+          timeout_minutes: ${{ env.rerun-tests-timeout }}
+          max_attempts: ${{ env.rerun-tests-max-attempts }}
+          retry_on: any
+          command: |
+            . $CONDA/etc/profile.d/conda.sh
+            . $CONDA/etc/profile.d/mamba.sh
+            mamba activate ${{ env.test-env-name }}
+
+            python -m pytest -ra --pyargs dpnp.tests.tensor
         env:
           SYCL_CACHE_PERSISTENT: 1
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index eb66c91dc8c2..cbce48eb728c 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -37,7 +37,7 @@ jobs:
       actions: write
 
     runs-on: ${{ matrix.os }}
-    timeout-minutes: 80
+    timeout-minutes: 90
 
     defaults:
       run:
@@ -220,6 +220,7 @@ jobs:
       - name: Run tests
         if: env.rerun-tests-on-failure != 'true'
         run: |
+          export SKIP_TENSOR_TESTS=1
           if [[ "${{ matrix.python }}" == "${{ env.python-ver-test-all-dtypes }}" ]]; then
             export DPNP_TEST_ALL_INT_TYPES=1
             python -m pytest -ra --pyargs ${{ env.package-name }}.tests
@@ -239,6 +240,7 @@ jobs:
             . $CONDA/etc/profile.d/conda.sh
             . $CONDA/etc/profile.d/mamba.sh
             mamba activate ${{ env.test-env-name }}
+            export SKIP_TENSOR_TESTS=1
 
             if [[ "${{ matrix.python }}" == "${{ env.python-ver-test-all-dtypes }}" ]]; then
               export DPNP_TEST_ALL_INT_TYPES=1
@@ -247,6 +249,26 @@ jobs:
               python -m pytest -n auto -ra --pyargs ${{ env.package-name }}.tests
             fi
 
+      - name: Run tensor tests
+        if: env.rerun-tests-on-failure != 'true'
+        run: |
+          python -m pytest -n auto -ra --pyargs dpnp.tests.tensor
+
+      - name: Run tensor tests
+        if: env.rerun-tests-on-failure == 'true'
+        id: run_tests_tensor_linux
+        uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
+        with:
+          timeout_minutes: ${{ env.rerun-tests-timeout }}
+          max_attempts: ${{ env.rerun-tests-max-attempts }}
+          retry_on: any
+          command: |
+            . $CONDA/etc/profile.d/conda.sh
+            . $CONDA/etc/profile.d/mamba.sh
+            mamba activate ${{ env.test-env-name }}
+
+            python -m pytest -n auto -ra --pyargs dpnp.tests.tensor
+
   test_windows:
     name: Test
 
@@ -382,6 +404,7 @@ jobs:
         if: env.rerun-tests-on-failure != 'true'
         shell: pwsh
         run: |
+          $env:SKIP_TENSOR_TESTS=1
           if (${{ matrix.python }} -eq ${{ env.python-ver-test-all-dtypes }}) {
             $env:DPNP_TEST_ALL_INT_TYPES=1
             python -m pytest -ra --pyargs ${{ env.package-name }}.tests
@@ -399,6 +422,7 @@ jobs:
           retry_on: any
           shell: pwsh
           command: |
+            $env:SKIP_TENSOR_TESTS=1
             if ( ${{ matrix.python }} -eq ${{ env.python-ver-test-all-dtypes }} ) {
               $env:DPNP_TEST_ALL_INT_TYPES=1
               python -m pytest -ra --pyargs ${{ env.package-name }}.tests
@@ -406,6 +430,24 @@ jobs:
               python -m pytest -n auto -ra --pyargs ${{ env.package-name }}.tests
             }
 
+      - name: Run tensor tests
+        if: env.rerun-tests-on-failure != 'true'
+        shell: pwsh
+        run: |
+          python -m pytest -n auto -ra --pyargs dpnp.tests.tensor
+
+      - name: Run tensor tests
+        if: env.rerun-tests-on-failure == 'true'
+        id: run_tests_tensor_win
+        uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
+        with:
+          timeout_minutes: ${{ env.rerun-tests-timeout }}
+          max_attempts: ${{ env.rerun-tests-max-attempts }}
+          retry_on: any
+          shell: pwsh
+          command: |
+            python -m pytest -n auto -ra --pyargs dpnp.tests.tensor
+
   upload:
     name: Upload
 
diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml
index f56018b4ef8e..5910549212f3 100644
--- a/.github/workflows/generate_coverage.yaml
+++ b/.github/workflows/generate_coverage.yaml
@@ -130,6 +130,7 @@ jobs:
             conda activate coverage
             [ -f /opt/intel/oneapi/setvars.sh ] && source /opt/intel/oneapi/setvars.sh
             git clean -fxd
+            export SKIP_TENSOR_TESTS=1
             python scripts/gen_coverage.py
 
       - name: Total number of coverage attempts
diff --git a/dpnp/tensor/CMakeLists.txt b/dpnp/tensor/CMakeLists.txt
index 0a8def4131df..d0fe57cade64 100644
--- a/dpnp/tensor/CMakeLists.txt
+++ b/dpnp/tensor/CMakeLists.txt
@@ -29,6 +29,17 @@
 
 find_package(Python COMPONENTS Development.Module)
 
+# Tensor-specific flags
+
+# dpctl doesn't add -fsycl globally
+# only to pybind11 module sources via add_sycl_to_target()
+string(REPLACE "-fsycl " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+# Use LLD linker (dpctl sets this at root level)
+if(UNIX)
+    add_link_options("-fuse-ld=lld")
+endif()
+
 # Remove global coverage flags for tensor
 # use link-time only approach like dpctl
 if(DPNP_GENERATE_COVERAGE)
@@ -55,6 +66,7 @@ if(
     endif()
 endif()
 
+# Match dpctl warning flags
 # Suppress unused parameter warnings
 add_compile_options(-Wno-unused-parameter)
 
diff --git a/dpnp/tensor/_usmarray.pyx b/dpnp/tensor/_usmarray.pyx
index 519fefed6129..ad172091702f 100644
--- a/dpnp/tensor/_usmarray.pyx
+++ b/dpnp/tensor/_usmarray.pyx
@@ -37,7 +37,7 @@ import numpy as np
 from dpctl._backend cimport DPCTLSyclUSMRef
 from dpctl._sycl_device_factory cimport _cached_default_device
 
-import dpnp.tensor
+import dpnp
 
 from ._data_types import bool as dpt_bool
 from ._device import Device
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
index b20166a4d505..18867a09bcef 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
@@ -116,6 +116,7 @@ template <typename T>
 struct RoundOutputType
 {
     using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, bool, sycl::half>,
         td_ns::TypeMapResultEntry<T, std::uint8_t>,
         td_ns::TypeMapResultEntry<T, std::uint16_t>,
         td_ns::TypeMapResultEntry<T, std::uint32_t>,
diff --git a/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp b/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp
index f5ea4d4ca486..f45918e3c800 100644
--- a/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -283,9 +283,21 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
         const bool in_range = (lane_id < n_aggregates);
         const bool in_bounds = in_range && (lane_id > 0 || large_wg);
 
-        T __scan_val = (in_bounds)
-                           ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
-                           : identity;
+        // Here is a bug where IGC incorrectly optimized the below code:
+        // T __scan_val = (in_bounds)
+        //                 ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
+        //                 : identity;
+        // That causes `__scan_val` is not initialized with `identity` value:
+        //   wgs = 256, max_sgSize = 16   =>   n_aggregates = 16
+        //   wi = 0:   in_range = 1, in_bounds = 0   =>   __scan_val = identity
+        // The w/s adds SYCL atomic fence, since the explicit memory fence
+        // prevents reordering/elimination, while it will add slight overhead.
+        T __scan_val = identity;
+        sycl::atomic_fence(sycl::memory_order::relaxed,
+                           sycl::memory_scope::work_item);
+        if (in_bounds) {
+            __scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1];
+        }
         for (std::uint32_t step = 1; step < sgSize; step *= 2) {
             const bool advanced_lane = (lane_id >= step);
             const std::uint32_t src_lane_id =
diff --git a/dpnp/tests/config.py b/dpnp/tests/config.py
index a49fd8cad250..e576c643695b 100644
--- a/dpnp/tests/config.py
+++ b/dpnp/tests/config.py
@@ -4,6 +4,7 @@
 float16_types = bool(os.getenv("DPNP_TEST_FLOAT_16", 0))
 complex_types = bool(os.getenv("DPNP_TEST_COMPLEX_TYPES", 0))
 bool_types = bool(os.getenv("DPNP_TEST_BOOL_TYPES", 0))
+skip_tensor_tests = bool(int(os.getenv("SKIP_TENSOR_TESTS", 0)))
 
 
 infra_warnings_enable = bool(os.getenv("DPNP_INFRA_WARNINGS_ENABLE", 0))
diff --git a/dpnp/tests/conftest.py b/dpnp/tests/conftest.py
index 5d766566bca5..8e3cb97ad41f 100644
--- a/dpnp/tests/conftest.py
+++ b/dpnp/tests/conftest.py
@@ -97,6 +97,10 @@ def pytest_configure(config):
     # Equivalent to norecursedirs = tests_perf
     config.addinivalue_line("norecursedirs", "tests_perf")
 
+    # Equivalent to norecursedirs = tests/tensor (conditional)
+    if dtype_config.skip_tensor_tests:
+        config.addinivalue_line("norecursedirs", "tests/tensor")
+
     # Register pytest markers
     config.addinivalue_line(
         "markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')"
diff --git a/dpnp/tests/tensor/__init__.py b/dpnp/tests/tensor/__init__.py
new file mode 100644
index 000000000000..b18d8ddc7dd1
--- /dev/null
+++ b/dpnp/tests/tensor/__init__.py
@@ -0,0 +1,31 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+__doc__ = r"""
+Test suite for tensor functionality migrated from dpctl.
+Running test suite requires Cython and a working compiler."""
diff --git a/dpnp/tests/tensor/conftest.py b/dpnp/tests/tensor/conftest.py
new file mode 100644
index 000000000000..ea10d1322e76
--- /dev/null
+++ b/dpnp/tests/tensor/conftest.py
@@ -0,0 +1,31 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+"""Configures pytest to discover helper/ module"""
+
+from dpnp.tests.conftest import suppress_invalid_numpy_warnings
diff --git a/dpnp/tests/tensor/elementwise/__init__.py b/dpnp/tests/tensor/elementwise/__init__.py
new file mode 100644
index 000000000000..a794242cd7bb
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/__init__.py
@@ -0,0 +1,32 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+"""
+Collection of test and utility files for testing elementwise operations
+over :class:`dpnp.tensor.usm_ndarray`.
+"""
diff --git a/dpnp/tests/tensor/elementwise/test_abs.py b/dpnp/tests/tensor/elementwise/test_abs.py
new file mode 100644
index 000000000000..535aebfb4d58
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_abs.py
@@ -0,0 +1,224 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+import warnings
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _complex_fp_dtypes,
+    _real_fp_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_abs_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q)
+    if np.issubdtype(arg_dt, np.complexfloating):
+        type_map = {
+            np.dtype("c8"): np.dtype("f4"),
+            np.dtype("c16"): np.dtype("f8"),
+        }
+        assert dpt.abs(X).dtype == type_map[arg_dt]
+
+        r = dpt.empty_like(X, dtype=type_map[arg_dt])
+        dpt.abs(X, out=r)
+        assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.abs(X)))
+    else:
+        assert dpt.abs(X).dtype == arg_dt
+
+        r = dpt.empty_like(X, dtype=arg_dt)
+        dpt.abs(X, out=r)
+        assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.abs(X)))
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_abs_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("i4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    Y = dpt.abs(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = dpt.asnumpy(X)
+    assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+def test_abs_types_property():
+    get_queue_or_skip()
+    types = dpt.abs.types
+    assert isinstance(types, list)
+    assert len(types) > 0
+    assert types == dpt.abs.types_
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_abs_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    exp_dt = np.abs(np.ones(tuple(), dtype=arg_dt)).dtype
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.ones(U.shape, dtype=exp_dt)
+        expected_Y[..., 1::2] = 0
+        expected_Y = np.transpose(expected_Y, perms)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.abs(U, order=ord)
+            assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_abs_complex(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    Xnp = np.random.standard_normal(
+        size=input_shape
+    ) + 1j * np.random.standard_normal(size=input_shape)
+    Xnp = Xnp.astype(arg_dt)
+    X[...] = Xnp
+
+    for ord in ["C", "F", "A", "K"]:
+        for perms in itertools.permutations(range(4)):
+            U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+            Y = dpt.abs(U, order=ord)
+            expected_Y = np.abs(np.transpose(Xnp[:, ::-1, ::-1, :], perms))
+            tol = dpt.finfo(Y.dtype).resolution
+            np.testing.assert_allclose(
+                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
+            )
+
+
+def test_abs_out_overlap():
+    get_queue_or_skip()
+
+    X = dpt.arange(-3, 3, 1, dtype="i4")
+    expected = dpt.asarray([3, 2, 1, 0, 1, 2], dtype="i4")
+    Y = dpt.abs(X, out=X)
+
+    assert Y is X
+    assert dpt.all(expected == X)
+
+    X = dpt.arange(-3, 3, 1, dtype="i4")
+    expected = expected[::-1]
+    Y = dpt.abs(X, out=X[::-1])
+    assert Y is not X
+    assert dpt.all(expected == X)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_abs_real_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+
+    x = dpt.asarray(inps_, dtype=dtype)
+    r = dpt.abs(x)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.abs(np.asarray(inps_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
+def test_abs_complex_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
+
+    z = dpt.asarray(c_, dtype=dtype)
+    r = dpt.abs(z)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.abs(np.asarray(c_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_abs_alignment(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.ones(512, dtype=dtype)
+    r = dpt.abs(x)
+
+    r2 = dpt.abs(x[1:])
+    assert np.allclose(dpt.asnumpy(r[1:]), dpt.asnumpy(r2))
+
+    dpt.abs(x[:-1], out=r[1:])
+    assert np.allclose(dpt.asnumpy(r[1:]), dpt.asnumpy(r2))
diff --git a/dpnp/tests/tensor/elementwise/test_add.py b/dpnp/tests/tensor/elementwise/test_add.py
new file mode 100644
index 000000000000..0320ec642a66
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_add.py
@@ -0,0 +1,593 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+import re
+
+import dpctl
+import numpy as np
+import pytest
+from dpctl.utils import ExecutionPlacementError
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_add_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.add(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_dtype = np.add(
+        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
+    ).dtype
+    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == np.full(r.shape, 2, dtype=r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r2 = dpt.empty_like(ar1, dtype=r.dtype)
+    dpt.add(ar1, ar2, out=r2)
+    assert (dpt.asnumpy(r2) == np.full(r2.shape, 2, dtype=r2.dtype)).all()
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.add(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_dtype = np.add(
+        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
+    ).dtype
+    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == np.full(r.shape, 2, dtype=r.dtype)).all()
+
+    r2 = dpt.empty_like(ar1, dtype=r.dtype)
+    dpt.add(ar3[::-1], ar4[::2], out=r2)
+    assert (dpt.asnumpy(r2) == np.full(r2.shape, 2, dtype=r2.dtype)).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_add_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.add(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_add_order():
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
+        ar1 = dpt.ones(test_shape, dtype=dt1, order="C")
+        ar2 = dpt.ones(test_shape, dtype=dt2, order="C")
+        r1 = dpt.add(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.add(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.add(ar1, ar2, order="A")
+        assert r3.flags.c_contiguous
+        r4 = dpt.add(ar1, ar2, order="K")
+        assert r4.flags.c_contiguous
+
+        ar1 = dpt.ones(test_shape, dtype=dt1, order="F")
+        ar2 = dpt.ones(test_shape, dtype=dt2, order="F")
+        r1 = dpt.add(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.add(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.add(ar1, ar2, order="A")
+        assert r3.flags.f_contiguous
+        r4 = dpt.add(ar1, ar2, order="K")
+        assert r4.flags.f_contiguous
+
+        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2]
+        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2]
+        r4 = dpt.add(ar1, ar2, order="K")
+        assert r4.strides == (n, -1)
+        r5 = dpt.add(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT
+        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT
+        r4 = dpt.add(ar1, ar2, order="K")
+        assert r4.strides == (-1, n)
+        r5 = dpt.add(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+
+def test_add_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    r = dpt.add(m, v)
+    assert (dpt.asnumpy(r) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
+
+    r2 = dpt.add(v, m)
+    assert (dpt.asnumpy(r2) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
+
+    r3 = dpt.empty_like(m)
+    dpt.add(m, v, out=r3)
+    assert (dpt.asnumpy(r3) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
+
+    r4 = dpt.empty_like(m)
+    dpt.add(v, m, out=r4)
+    assert (dpt.asnumpy(r4) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
+
+
+def test_add_broadcasting_new_shape():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((6, 1), dtype="i4")
+    ar2 = dpt.arange(6, dtype="i4")
+
+    r = dpt.add(ar1, ar2)
+    assert (dpt.asnumpy(r) == np.arange(1, 7, dtype="i4")[np.newaxis, :]).all()
+
+    r1 = dpt.add(ar2, ar1)
+    assert (dpt.asnumpy(r1) == np.arange(1, 7, dtype="i4")[np.newaxis, :]).all()
+
+    r2 = dpt.add(ar1[::2], ar2[::2])
+    assert (
+        dpt.asnumpy(r2) == np.arange(1, 7, dtype="i4")[::2][np.newaxis, :]
+    ).all()
+
+    r3 = dpt.empty_like(ar1)
+    with pytest.raises(ValueError):
+        dpt.add(ar1, ar2, out=r3)
+
+    ar3 = dpt.ones((6, 1), dtype="i4")
+    ar4 = dpt.ones((1, 6), dtype="i4")
+
+    r4 = dpt.add(ar3, ar4)
+    assert (dpt.asnumpy(r4) == np.full((6, 6), 2, dtype="i4")).all()
+
+    r5 = dpt.add(ar4, ar3)
+    assert (dpt.asnumpy(r5) == np.full((6, 6), 2, dtype="i4")).all()
+
+    r6 = dpt.add(ar3[::2], ar4[:, ::2])
+    assert (dpt.asnumpy(r6) == np.full((3, 3), 2, dtype="i4")).all()
+
+    r7 = dpt.add(ar3[::2], ar4)
+    assert (dpt.asnumpy(r7) == np.full((3, 6), 2, dtype="i4")).all()
+
+
+def test_add_broadcasting_error():
+    get_queue_or_skip()
+    m = dpt.ones((10, 10), dtype="i4")
+    v = dpt.ones((3,), dtype="i4")
+    with pytest.raises(ValueError):
+        dpt.add(m, v)
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_add_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_zeros = (
+        bool(0),
+        int(0),
+        float(0),
+        complex(0),
+        np.float32(0),
+        ctypes.c_int(0),
+    )
+    for sc in py_zeros:
+        R = dpt.add(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.add(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_add_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.add(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_add_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.add(a, c)
+
+
+def test_add_types_property():
+    get_queue_or_skip()
+    types = dpt.add.types
+    assert isinstance(types, list)
+    assert len(types) > 0
+    assert types == dpt.add.types_
+
+
+def test_add_errors():
+    get_queue_or_skip()
+    try:
+        gpu_queue = dpctl.SyclQueue("gpu")
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("SyclQueue('gpu') failed, skipping")
+    try:
+        cpu_queue = dpctl.SyclQueue("cpu")
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("SyclQueue('cpu') failed, skipping")
+
+    ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue)
+    ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue)
+    y = dpt.empty_like(ar1, sycl_queue=cpu_queue)
+    with pytest.raises(ExecutionPlacementError) as excinfo:
+        dpt.add(ar1, ar2, out=y)
+    assert "Input and output allocation queues are not compatible" in str(
+        excinfo.value
+    )
+
+    ar1 = dpt.ones(2, dtype="float32")
+    ar2 = dpt.ones_like(ar1, dtype="int32")
+    y = dpt.empty(3)
+    with pytest.raises(ValueError) as excinfo:
+        dpt.add(ar1, ar2, out=y)
+    assert "The shape of input and output arrays are inconsistent" in str(
+        excinfo.value
+    )
+
+    ar1 = np.ones(2, dtype="float32")
+    ar2 = np.ones_like(ar1, dtype="int32")
+    with pytest.raises(ExecutionPlacementError) as excinfo:
+        dpt.add(ar1, ar2)
+    assert re.match(
+        "Execution placement can not be unambiguously inferred.*",
+        str(excinfo.value),
+    )
+
+    ar1 = dpt.ones(2, dtype="float32")
+    ar2 = dpt.ones_like(ar1, dtype="int32")
+    y = np.empty(ar1.shape, dtype=ar1.dtype)
+    with pytest.raises(TypeError) as excinfo:
+        dpt.add(ar1, ar2, out=y)
+    assert "output array must be of usm_ndarray type" in str(excinfo.value)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_add_dtype_error(
+    dtype,
+):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    ar1 = dpt.ones(5, dtype=dtype)
+    ar2 = dpt.ones_like(ar1, dtype="f4")
+
+    y = dpt.zeros_like(ar1, dtype="int8")
+    with pytest.raises(ValueError) as excinfo:
+        dpt.add(ar1, ar2, out=y)
+    assert re.match("Output array of type.*is needed", str(excinfo.value))
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_add_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X += int(0)
+    elif dt_kind == "f":
+        X += float(0)
+    elif dt_kind == "c":
+        X += complex(0)
+    elif dt_kind == "b":
+        X += bool(0)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_add_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    # operators use a different Python implementation which permits
+    # same kind style casting
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 += ar2
+        assert (
+            dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype=ar1.dtype)
+        ).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2]
+        ar3 += ar4
+        assert (
+            dpt.asnumpy(ar3) == np.full(ar3.shape, 2, dtype=ar3.dtype)
+        ).all()
+    else:
+        with pytest.raises(ValueError):
+            ar1 += ar2
+
+    # here, test the special case where out is the first argument
+    # so an in-place kernel is used for efficiency
+    # this covers a specific branch in the BinaryElementwiseFunc logic
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        dpt.add(ar1, ar2, out=ar1)
+        assert (
+            dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype=ar1.dtype)
+        ).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2]
+        dpt.add(ar3, ar4, out=ar3)
+        assert (
+            dpt.asnumpy(ar3) == np.full(ar3.shape, 2, dtype=ar3.dtype)
+        ).all()
+    else:
+        with pytest.raises(ValueError):
+            dpt.add(ar1, ar2, out=ar1)
+
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.add(ar1, ar2, out=ar2)
+        assert (
+            dpt.asnumpy(ar2) == np.full(ar2.shape, 2, dtype=ar2.dtype)
+        ).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2]
+        dpt.add(ar3, ar4, out=ar4)
+        assert (
+            dpt.asnumpy(ar4) == np.full(ar4.shape, 2, dtype=ar4.dtype)
+        ).all()
+    else:
+        with pytest.raises(ValueError):
+            dpt.add(ar1, ar2, out=ar2)
+
+
+def test_add_inplace_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    dpt.add(m, v, out=m)
+    assert (dpt.asnumpy(m) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
+
+    # check case where second arg is out
+    dpt.add(v, m, out=m)
+    assert (
+        dpt.asnumpy(m) == np.arange(10, dtype="i4")[np.newaxis, 1:10:2]
+    ).all()
+
+
+def test_add_inplace_operator_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    m += v
+    assert (dpt.asnumpy(m) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
+
+
+def test_add_inplace_operator_mutual_broadcast():
+    get_queue_or_skip()
+
+    x1 = dpt.ones((1, 10), dtype="i4")
+    x2 = dpt.ones((10, 1), dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.add._inplace_op(x1, x2)
+
+
+def test_add_inplace_errors():
+    get_queue_or_skip()
+    try:
+        gpu_queue = dpctl.SyclQueue("gpu")
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("SyclQueue('gpu') failed, skipping")
+    try:
+        cpu_queue = dpctl.SyclQueue("cpu")
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("SyclQueue('cpu') failed, skipping")
+
+    ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue)
+    ar2 = dpt.ones_like(ar1, sycl_queue=cpu_queue)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.add(ar1, ar2, out=ar1)
+
+    ar1 = dpt.ones(2, dtype="float32")
+    ar2 = dpt.ones(3, dtype="float32")
+    with pytest.raises(ValueError):
+        dpt.add(ar1, ar2, out=ar1)
+
+    ar1 = np.ones(2, dtype="float32")
+    ar2 = dpt.ones(2, dtype="float32")
+    with pytest.raises(TypeError):
+        dpt.add(ar1, ar2, out=ar1)
+
+    ar1 = dpt.ones(2, dtype="float32")
+    ar2 = {}
+    with pytest.raises(ValueError):
+        dpt.add(ar1, ar2, out=ar1)
+
+    ar1 = dpt.ones((2, 1), dtype="float32")
+    ar2 = dpt.ones((1, 2), dtype="float32")
+    with pytest.raises(ValueError):
+        dpt.add(ar1, ar2, out=ar1)
+
+
+def test_add_inplace_operator_errors():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="i4", sycl_queue=q1)
+    with pytest.raises(TypeError):
+        dpt.add._inplace_op(dict(), x)
+
+    x.flags["W"] = False
+    with pytest.raises(ValueError):
+        dpt.add._inplace_op(x, 2)
+
+    x_q1 = dpt.ones(10, dtype="i4", sycl_queue=q1)
+    x_q2 = dpt.ones(10, dtype="i4", sycl_queue=q2)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.add._inplace_op(x_q1, x_q2)
+
+
+def test_add_inplace_same_tensors():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones(10, dtype="i4")
+    ar1 += ar1
+    assert (dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype="i4")).all()
+
+    ar1 = dpt.ones(10, dtype="i4")
+    ar2 = dpt.ones(10, dtype="i4")
+    dpt.add(ar1, ar2, out=ar1)
+    # all ar1 vals should be 2
+    assert (dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype="i4")).all()
+
+    dpt.add(ar2, ar1, out=ar2)
+    # all ar2 vals should be 3
+    assert (dpt.asnumpy(ar2) == np.full(ar2.shape, 3, dtype="i4")).all()
+
+    dpt.add(ar1, ar2, out=ar2)
+    # all ar2 vals should be 5
+    assert (dpt.asnumpy(ar2) == np.full(ar2.shape, 5, dtype="i4")).all()
+
+
+def test_add_str_repr():
+    add_s = str(dpt.add)
+    assert isinstance(add_s, str)
+    assert "add" in add_s
+
+    add_r = repr(dpt.add)
+    assert isinstance(add_r, str)
+    assert "add" in add_r
+
+
+def test_add_cfd():
+    q1 = get_queue_or_skip()
+    q2 = dpctl.SyclQueue(q1.sycl_device)
+
+    x1 = dpt.ones(10, sycl_queue=q1)
+    x2 = dpt.ones(10, sycl_queue=q2)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.add(x1, x2)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.add(x1, x1, out=x2)
+
+
+def test_add_out_type_check():
+    get_queue_or_skip()
+
+    x1 = dpt.ones(10)
+    x2 = dpt.ones(10)
+
+    out = range(10)
+
+    with pytest.raises(TypeError):
+        dpt.add(x1, x2, out=out)
+
+
+def test_add_out_need_temporary():
+    get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="u4")
+
+    dpt.add(x[:6], 1, out=x[-6:])
+
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
diff --git a/dpnp/tests/tensor/elementwise/test_angle.py b/dpnp/tests/tensor/elementwise/test_angle.py
new file mode 100644
index 000000000000..09dc2bfc414f
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_angle.py
@@ -0,0 +1,111 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _complex_fp_dtypes,
+    _no_complex_dtypes,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_angle_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    dt = dpt.dtype(dtype)
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(dt, dpt.complex64, _fp16, _fp64):
+        assert dpt.angle(x).dtype == dpt.float32
+    else:
+        assert dpt.angle(x).dtype == dpt.float64
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
+def test_angle_real(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.arange(10, dtype=dtype, sycl_queue=q)
+    r = dpt.angle(x)
+
+    assert dpt.all(r == 0)
+
+
+@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
+def test_angle_complex(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    vals = dpt.pi * dpt.arange(10, dtype=dpt.finfo(dtype).dtype, sycl_queue=q)
+
+    x = dpt.zeros(10, dtype=dtype, sycl_queue=q)
+
+    x.imag[...] = vals
+    r = dpt.angle(x)
+    expected = dpt.atan2(x.imag, x.real)
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol)
+
+    x.real[...] += dpt.pi
+    r = dpt.angle(x)
+    expected = dpt.atan2(x.imag, x.real)
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_angle_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    vals = [np.nan, -np.nan, np.inf, -np.inf, +0.0, -0.0]
+    vals = [complex(*val) for val in itertools.product(vals, repeat=2)]
+
+    x = dpt.asarray(vals, dtype=dtype, sycl_queue=q)
+
+    r = dpt.angle(x)
+    expected = dpt.atan2(x.imag, x.real)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpnp/tests/tensor/elementwise/test_atan2.py b/dpnp/tests/tensor/elementwise/test_atan2.py
new file mode 100644
index 000000000000..7a7bb92cdd7b
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_atan2.py
@@ -0,0 +1,524 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _compare_dtypes,
+    _no_complex_dtypes,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
+def test_atan2_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    r = dpt.atan2(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.arctan2(
+        np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+
+    tol = 8 * max(
+        dpt.finfo(r.dtype).resolution, dpt.finfo(expected.dtype).resolution
+    )
+    assert_allclose(dpt.asnumpy(r), expected, atol=tol, rtol=tol)
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)
+
+    r = dpt.atan2(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.arctan2(
+        np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+
+    tol = 8 * max(
+        dpt.finfo(r.dtype).resolution, dpt.finfo(expected.dtype).resolution
+    )
+    assert_allclose(dpt.asnumpy(r), expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:])
+def test_atan2_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.atan2(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.atan2(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_one_nan(dt):
+    """If either x1_i or x2_i is NaN, the result is NaN."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([dpt.nan, dpt.nan, 1], dtype=dt)
+    x2 = dpt.asarray([dpt.nan, 1, dpt.nan], dtype=dt)
+
+    y = dpt.atan2(x1, x2)
+    assert dpt.all(dpt.isnan(y))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_positive_and_pzero(dt):
+    """If x1_i is greater than 0 and x2_i is +0, the result
+    is an approximation to +pi/2.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
+    x2 = dpt.asarray([+0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi / 2, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_positive_and_nzero(dt):
+    """If x1_i is greater than 0 and x2_i is -0, the result
+    is an approximation to +pi/2.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
+    x2 = dpt.asarray([-0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi / 2, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pzero_and_positive(dt):
+    """If x1_i is +0 and x2_i is greater than 0,
+    the result is +0.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(+0.0, dtype=dt)
+    x2 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(+0.0, dtype=dt)
+
+    assert dpt.all(dpt.equal(actual, expected))
+    assert not dpt.any(dpt.signbit(actual))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pzero_and_pzero(dt):
+    """If x1_i is +0 and x2_i is +0, the result is +0."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(+0.0, dtype=dt)
+    x2 = dpt.asarray([+0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(+0.0, dtype=dt)
+
+    assert dpt.all(dpt.equal(actual, expected))
+    assert not dpt.any(dpt.signbit(actual))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pzero_and_nzero(dt):
+    """
+    If x1_i is +0 and x2_i is -0, the result is an
+    approximation to +pi.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(+0.0, dtype=dt)
+    x2 = dpt.asarray([-0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pzero_and_negatvie(dt):
+    """
+    If x1_i is +0 and x2_i is less than 0, the result
+    is an approximation to +pi.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(+0.0, dtype=dt)
+    x2 = dpt.asarray([-0.5, -1, -2, -dpt.inf], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_nzero_and_positive(dt):
+    """If x1_i is -0 and x2_i is greater than 0,
+    the result is -0.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(-0.0, dtype=dt)
+    x2 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(-0.0, dtype=dt)
+
+    assert dpt.all(dpt.equal(actual, expected))
+    assert dpt.all(dpt.signbit(actual))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_nzero_and_pzero(dt):
+    """If x1_i is -0 and x2_i is +0, the result is -0."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(-0.0, dtype=dt)
+    x2 = dpt.asarray([+0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(-0.0, dtype=dt)
+
+    assert dpt.all(dpt.equal(actual, expected))
+    assert dpt.all(dpt.signbit(actual))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_nzero_and_nzero(dt):
+    """If x1_i is -0 and x2_i is -0, the result is
+    an approximation to -pi.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([-0.0], dtype=dt)
+    x2 = dpt.asarray([-0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(-dpt.pi, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_nzero_and_negative(dt):
+    """If x1_i is -0 and x2_i is less than 0, the result
+    is an approximation to -pi.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([-0.0], dtype=dt)
+    x2 = dpt.asarray([-dpt.inf, -2, -1, -0.5], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(-dpt.pi, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_negative_and_pzero(dt):
+    """If x1_i is less than 0 and x2_i is +0, the result
+    is an approximation to -pi/2.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([-dpt.inf, -2, -1, -0.5], dtype=dt)
+    x2 = dpt.asarray(+0.0, dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(-dpt.pi / 2, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_negative_and_nzero(dt):
+    """If x1_i is less than 0 and x2_i is -0, the result
+    is an approximation to -pi/2."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([-dpt.inf, -2, -1, -0.5], dtype=dt)
+    x2 = dpt.asarray(-0.0, dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(-dpt.pi / 2, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pfinite_and_pinf(dt):
+    """If x1_i is greater than 0, x1_i is a finite number,
+    and x2_i is +infinity, the result is +0."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([0.5, 1, 2, 5], dtype=dt)
+    x2 = dpt.asarray(dpt.inf, dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(+0.0, dtype=dt)
+    assert dpt.all(dpt.equal(actual, expected))
+    assert not dpt.any(dpt.signbit(actual))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pfinite_and_ninf(dt):
+    """If x1_i is greater than 0, x1_i is a finite number,
+    and x2_i is -infinity, the result is an approximation
+    to +pi."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([0.5, 1, 2, 5], dtype=dt)
+    x2 = dpt.asarray(-dpt.inf, dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_nfinite_and_pinf(dt):
+    """If x1_i is less than 0, x1_i is a finite number,
+    and x2_i is +infinity, the result is -0."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([-0.5, -1, -2, -5], dtype=dt)
+    x2 = dpt.asarray(dpt.inf, dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(-0.0, dtype=dt)
+    assert dpt.all(dpt.equal(actual, expected))
+    assert dpt.all(dpt.signbit(actual))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_nfinite_and_ninf(dt):
+    """If x1_i is less than 0, x1_i is a finite number, and
+    x2_i is -infinity, the result is an approximation
+    to -pi."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([-0.5, -1, -2, -5], dtype=dt)
+    x2 = dpt.asarray(-dpt.inf, dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(-dpt.pi, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pinf_and_finite(dt):
+    """If x1_i is +infinity and x2_i is a finite number,
+    the result is an approximation to +pi/2.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(dpt.inf, dtype=dt)
+    x2 = dpt.asarray([-2, -0.0, 0.0, 2], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi / 2, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_ninf_and_finite(dt):
+    """If x1_i is -infinity and x2_i is a finite number,
+    the result is an approximation to -pi/2.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(-dpt.inf, dtype=dt)
+    x2 = dpt.asarray([-2, -0.0, 0.0, 2], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(-dpt.pi / 2, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pinf_and_pinf(dt):
+    """If x1_i is +infinity and x2_i is +infinity,
+    the result is an approximation to +pi/4.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(dpt.inf, dtype=dt)
+    x2 = dpt.asarray([dpt.inf], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi / 4, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pinf_and_ninf(dt):
+    """If x1_i is +infinity and x2_i is -infinity,
+    the result is an approximation to +3*pi/4.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(dpt.inf, dtype=dt)
+    x2 = dpt.asarray([-dpt.inf], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(3 * dpt.pi / 4, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_ninf_and_pinf(dt):
+    """If x1_i is -infinity and x2_i is +infinity,
+    the result is an approximation to -pi/4.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(-dpt.inf, dtype=dt)
+    x2 = dpt.asarray([dpt.inf], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(-dpt.pi / 4, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_ninf_and_ninf(dt):
+    """If x1_i is -infinity and x2_i is -infinity,
+    the result is an approximation to -3*pi/4.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(-dpt.inf, dtype=dt)
+    x2 = dpt.asarray([-dpt.inf], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(-3 * dpt.pi / 4, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_and.py b/dpnp/tests/tensor/elementwise/test_bitwise_and.py
new file mode 100644
index 000000000000..c9172cb9d7d6
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_bitwise_and.py
@@ -0,0 +1,142 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _integral_dtypes
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_and_dtype_matrix_contig(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 7
+    n = 2 * sz
+    dt1 = dpt.dtype(op_dtype)
+    dt2 = dpt.dtype(op_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
+
+    x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0
+    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)
+
+    r = dpt.bitwise_and(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)
+    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)
+    r_np = np.bitwise_and(x1_np, x2_np)
+
+    assert (r_np == dpt.asnumpy(r)).all()
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_and_dtype_matrix_strided(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 11
+    n = 2 * sz
+    dt1 = dpt.dtype(op_dtype)
+    dt2 = dpt.dtype(op_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2]
+
+    x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0
+    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2]
+
+    r = dpt.bitwise_and(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2]
+    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2]
+    r_np = np.bitwise_and(x1_np, x2_np)
+
+    assert (r_np == dpt.asnumpy(r)).all()
+
+
+def test_bitwise_and_bool():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([True, False])
+    x2 = dpt.asarray([False, True])
+
+    r_bw = dpt.bitwise_and(x1[:, dpt.newaxis], x2[dpt.newaxis])
+    r_lo = dpt.logical_and(x1[:, dpt.newaxis], x2[dpt.newaxis])
+
+    assert dpt.all(dpt.equal(r_bw, r_lo))
+
+
+@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes)
+def test_bitwise_and_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind == "b":
+        X &= False
+    else:
+        X &= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes)
+def test_bitwise_and_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 &= ar2
+        assert dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 &= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(ValueError):
+            ar1 &= ar2
diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_invert.py b/dpnp/tests/tensor/elementwise/test_bitwise_invert.py
new file mode 100644
index 000000000000..2b7a7c3a6f93
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_bitwise_invert.py
@@ -0,0 +1,148 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _compare_dtypes,
+    _integral_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize(
+    "op_dtype",
+    [
+        "b1",
+    ]
+    + _integral_dtypes,
+)
+def test_bitwise_invert_dtype_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 7
+    ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op_dtype)
+
+    r = dpt.bitwise_invert(ar1)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == ar1.dtype
+
+    expected = np.bitwise_not(dpt.asnumpy(ar1))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r2 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.bitwise_invert(ar1, out=r2)
+    assert dpt.all(dpt.equal(r, r2))
+
+    ar2 = dpt.zeros(sz, dtype=op_dtype)
+    r = dpt.bitwise_invert(ar2[::-1])
+    assert isinstance(r, dpt.usm_ndarray)
+
+    expected = np.bitwise_not(np.zeros(ar2.shape, dtype=op_dtype))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar2.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    ar3 = dpt.ones(sz, dtype=op_dtype)
+    r2 = dpt.bitwise_invert(ar3[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+
+    expected = np.bitwise_not(np.ones(ar3.shape, dtype=op_dtype)[::2])
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert (dpt.asnumpy(r2) == expected).all()
+
+    r3 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.bitwise_invert(ar2[::-1], out=r3)
+    assert dpt.all(dpt.equal(r, r3))
+
+
+@pytest.mark.parametrize("op_usm_type", _usm_types)
+def test_bitwise_invert_usm_type_matrix(op_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.asarray(
+        np.random.randint(0, 2, sz), dtype="i4", usm_type=op_usm_type
+    )
+
+    r = dpt.bitwise_invert(ar1)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.usm_type == op_usm_type
+
+
+def test_bitwise_invert_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.bitwise_invert(ar1, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.bitwise_invert(ar1, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.bitwise_invert(ar1, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.bitwise_invert(ar1, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.zeros((20, 20), dtype="i4", order="F")
+    r1 = dpt.bitwise_invert(ar1, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.bitwise_invert(ar1, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.bitwise_invert(ar1, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.bitwise_invert(ar1, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.bitwise_invert(ar1, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.zeros((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.bitwise_invert(ar1, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_bitwise_invert_large_boolean():
+    get_queue_or_skip()
+
+    x = dpt.tril(dpt.ones((32, 32), dtype="?"), k=-1)
+    res = dpt.astype(dpt.bitwise_invert(x), "i4")
+
+    assert dpt.all(res >= 0)
+    assert dpt.all(res <= 1)
diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_left_shift.py b/dpnp/tests/tensor/elementwise/test_bitwise_left_shift.py
new file mode 100644
index 000000000000..bb68aab227ab
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_bitwise_left_shift.py
@@ -0,0 +1,150 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _integral_dtypes
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_left_shift_dtype_matrix_contig(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]:
+        return
+
+    sz = 7
+    n = 2 * sz
+    dt1 = dpt.dtype(op1_dtype)
+    dt2 = dpt.dtype(op2_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
+    x2 = dpt.arange(0, n, dtype=dt2)
+
+    r = dpt.bitwise_left_shift(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.sycl_queue == x1.sycl_queue
+    assert r.sycl_queue == x2.sycl_queue
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op1_dtype)
+    x2_np = np.arange(0, n, dtype=op2_dtype)
+    r_np = np.left_shift(x1_np, x2_np)
+
+    assert r.dtype == r_np.dtype
+    assert (dpt.asnumpy(r) == r_np).all()
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_left_shift_dtype_matrix_strided(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]:
+        return
+
+    sz = 11
+    n = 2 * sz
+    dt1 = dpt.dtype(op1_dtype)
+    dt2 = dpt.dtype(op2_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2]
+    x2 = dpt.arange(0, n, dtype=dt2)[::2]
+
+    r = dpt.bitwise_left_shift(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.sycl_queue == x1.sycl_queue
+    assert r.sycl_queue == x2.sycl_queue
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2]
+    x2_np = np.arange(0, n, dtype=dt2)[::2]
+    r_np = np.left_shift(x1_np, x2_np)
+
+    assert r.dtype == r_np.dtype
+    assert (dpt.asnumpy(r) == r_np).all()
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_left_shift_range(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    x = dpt.ones(255, dtype=op_dtype)
+    y = dpt.asarray(64, dtype=op_dtype)
+
+    z = dpt.bitwise_left_shift(x, y)
+    assert dpt.all(dpt.equal(z, 0))
+
+
+@pytest.mark.parametrize("dtype", _integral_dtypes)
+def test_bitwise_left_shift_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    X <<= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_left_shift_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 <<= ar2
+        assert dpt.all(ar1 == 2)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 <<= ar4
+        assert dpt.all(ar3 == 2)
+    else:
+        with pytest.raises(ValueError):
+            ar1 <<= ar2
diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_or.py b/dpnp/tests/tensor/elementwise/test_bitwise_or.py
new file mode 100644
index 000000000000..0e1a5bfeab1c
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_bitwise_or.py
@@ -0,0 +1,158 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _integral_dtypes
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_or_dtype_matrix_contig(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 7
+    n = 2 * sz
+    dt1 = dpt.dtype(op_dtype)
+    dt2 = dpt.dtype(op_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
+
+    x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0
+    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)
+
+    r = dpt.bitwise_or(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)
+    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)
+    r_np = np.bitwise_or(x1_np, x2_np)
+
+    assert (r_np == dpt.asnumpy(r)).all()
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_or_dtype_matrix_strided(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 11
+    n = 2 * sz
+    dt1 = dpt.dtype(op_dtype)
+    dt2 = dpt.dtype(op_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2]
+
+    x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0
+    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2]
+
+    r = dpt.bitwise_or(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2]
+    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2]
+    r_np = np.bitwise_or(x1_np, x2_np)
+
+    assert (r_np == dpt.asnumpy(r)).all()
+
+
+def test_bitwise_or_bool():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([True, False])
+    x2 = dpt.asarray([False, True])
+
+    r_bw = dpt.bitwise_or(x1[:, dpt.newaxis], x2[dpt.newaxis])
+    r_lo = dpt.logical_or(x1[:, dpt.newaxis], x2[dpt.newaxis])
+
+    assert dpt.all(dpt.equal(r_bw, r_lo))
+
+
+@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes)
+def test_bitwise_or_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind == "b":
+        X |= False
+    else:
+        X |= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes)
+def test_bitwise_or_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 |= ar2
+        assert dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 |= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(ValueError):
+            ar1 |= ar2
+            dpt.bitwise_or(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_or(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_or(ar3, ar4, out=ar4)
+        dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(ValueError):
+            dpt.bitwise_or(ar1, ar2, out=ar2)
diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_right_shift.py b/dpnp/tests/tensor/elementwise/test_bitwise_right_shift.py
new file mode 100644
index 000000000000..cdd2da9ba863
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_bitwise_right_shift.py
@@ -0,0 +1,166 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _integral_dtypes
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_right_shift_dtype_matrix_contig(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]:
+        return
+
+    sz = 7
+    n = 2 * sz
+    dt1 = dpt.dtype(op1_dtype)
+    dt2 = dpt.dtype(op2_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
+    x2 = dpt.arange(0, n, dtype=dt2)
+
+    r = dpt.bitwise_right_shift(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.sycl_queue == x1.sycl_queue
+    assert r.sycl_queue == x2.sycl_queue
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op1_dtype)
+    x2_np = np.arange(0, n, dtype=op2_dtype)
+    r_np = np.right_shift(x1_np, x2_np)
+
+    assert r.dtype == r_np.dtype
+    assert (dpt.asnumpy(r) == r_np).all()
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_right_shift_dtype_matrix_strided(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]:
+        return
+
+    sz = 11
+    n = 2 * sz
+    dt1 = dpt.dtype(op1_dtype)
+    dt2 = dpt.dtype(op2_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2]
+    x2 = dpt.arange(0, n, dtype=dt2)[::2]
+
+    r = dpt.bitwise_right_shift(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.sycl_queue == x1.sycl_queue
+    assert r.sycl_queue == x2.sycl_queue
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2]
+    x2_np = np.arange(0, n, dtype=dt2)[::2]
+    r_np = np.right_shift(x1_np, x2_np)
+
+    assert r.dtype == r_np.dtype
+    assert (dpt.asnumpy(r) == r_np).all()
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_right_shift_range(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    x = dpt.ones(255, dtype=op_dtype)
+    y = dpt.asarray(64, dtype=op_dtype)
+
+    z = dpt.bitwise_right_shift(x, y)
+    assert dpt.all(dpt.equal(z, 0))
+
+
+@pytest.mark.parametrize("dtype", _integral_dtypes)
+def test_bitwise_right_shift_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    X >>= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_right_shift_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 >>= ar2
+        assert dpt.all(ar1 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 >>= ar4
+        assert dpt.all(ar3 == 0)
+    else:
+        with pytest.raises(ValueError):
+            ar1 >>= ar2
+            dpt.bitwise_right_shift(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_right_shift(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_right_shift(ar3, ar4, out=ar4)
+        dpt.all(ar4 == 0)
+    else:
+        with pytest.raises(ValueError):
+            dpt.bitwise_right_shift(ar1, ar2, out=ar2)
diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_xor.py b/dpnp/tests/tensor/elementwise/test_bitwise_xor.py
new file mode 100644
index 000000000000..60bc2c518e26
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_bitwise_xor.py
@@ -0,0 +1,158 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _integral_dtypes
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_xor_dtype_matrix_contig(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 7
+    n = 2 * sz
+    dt1 = dpt.dtype(op_dtype)
+    dt2 = dpt.dtype(op_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
+
+    x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0
+    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)
+
+    r = dpt.bitwise_xor(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)
+    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)
+    r_np = np.bitwise_xor(x1_np, x2_np)
+
+    assert (r_np == dpt.asnumpy(r)).all()
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_xor_dtype_matrix_strided(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 11
+    n = 2 * sz
+    dt1 = dpt.dtype(op_dtype)
+    dt2 = dpt.dtype(op_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2]
+
+    x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0
+    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2]
+
+    r = dpt.bitwise_xor(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2]
+    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2]
+    r_np = np.bitwise_xor(x1_np, x2_np)
+
+    assert (r_np == dpt.asnumpy(r)).all()
+
+
+def test_bitwise_xor_bool():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([True, False])
+    x2 = dpt.asarray([False, True])
+
+    r_bw = dpt.bitwise_xor(x1[:, dpt.newaxis], x2[dpt.newaxis])
+    r_lo = dpt.logical_xor(x1[:, dpt.newaxis], x2[dpt.newaxis])
+
+    assert dpt.all(dpt.equal(r_bw, r_lo))
+
+
+@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes)
+def test_bitwise_xor_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind == "b":
+        X ^= False
+    else:
+        X ^= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes)
+def test_bitwise_xor_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 ^= ar2
+        assert dpt.all(ar1 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 ^= ar4
+        assert dpt.all(ar3 == 0)
+    else:
+        with pytest.raises(ValueError):
+            ar1 ^= ar2
+            dpt.bitwise_xor(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_xor(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_xor(ar3, ar4, out=ar4)
+        dpt.all(ar4 == 0)
+    else:
+        with pytest.raises(ValueError):
+            dpt.bitwise_xor(ar1, ar2, out=ar2)
diff --git a/dpnp/tests/tensor/elementwise/test_cbrt.py b/dpnp/tests/tensor/elementwise/test_cbrt.py
new file mode 100644
index 000000000000..8c063d3fbdec
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_cbrt.py
@@ -0,0 +1,98 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _map_to_device_dtype,
+    _no_complex_dtypes,
+    _real_fp_dtypes,
+)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_cbrt_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.cbrt(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.cbrt(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_cbrt_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.cbrt(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_cbrt_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.cbrt(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
+def test_cbrt_special_cases():
+    get_queue_or_skip()
+
+    X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = dpt.cbrt(X)
+    expected = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    tol = dpt.finfo(dpt.float32).resolution
+
+    assert dpt.allclose(res, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpnp/tests/tensor/elementwise/test_complex.py b/dpnp/tests/tensor/elementwise/test_complex.py
new file mode 100644
index 000000000000..2a006a7c519a
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_complex.py
@@ -0,0 +1,243 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+import warnings
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_complex_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.real(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.real(X).dtype == expected_dtype
+
+    expected_dtype = np.imag(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.imag(X).dtype == expected_dtype
+
+    expected_dtype = np.conj(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.conj(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize(
+    "np_call, dpt_call",
+    [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)],
+)
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_complex_output(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+
+    x1 = np.linspace(0, 10, num=n_seq, dtype=dtype)
+    x2 = np.linspace(0, 20, num=n_seq, dtype=dtype)
+    Xnp = x1 + 1j * x2
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    Y = dpt_call(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np_call(Xnp), atol=tol, rtol=tol)
+
+    Z = dpt.empty_like(X, dtype=Y.dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(dpt.asnumpy(Z), np_call(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize(
+    "np_call, dpt_call",
+    [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)],
+)
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_complex_usm_type(np_call, dpt_call, usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("c8")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = np.pi / 6 + 1j * np.pi / 3
+    X[..., 1::2] = np.pi / 3 + 1j * np.pi / 6
+
+    Y = dpt_call(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    X_np = np.empty(input_shape, dtype=arg_dt)
+    X_np[..., 0::2] = np.complex64(np.pi / 6 + 1j * np.pi / 3)
+    X_np[..., 1::2] = np.complex64(np.pi / 3 + 1j * np.pi / 6)
+
+    expected_Y = np_call(X_np)
+
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize(
+    "np_call, dpt_call",
+    [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)],
+)
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_complex_order(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = np.pi / 6 + 1j * np.pi / 3
+    X[..., 1::2] = np.pi / 3 + 1j * np.pi / 6
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np_call(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt_call(U, order=ord)
+            assert_allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_projection_complex(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = [
+        complex(1, 2),
+        complex(dpt.inf, -1),
+        complex(0, -dpt.inf),
+        complex(-dpt.inf, dpt.nan),
+    ]
+    Y = [
+        complex(1, 2),
+        complex(np.inf, -0.0),
+        complex(np.inf, -0.0),
+        complex(np.inf, 0.0),
+    ]
+
+    Xf = dpt.asarray(X, dtype=dtype, sycl_queue=q)
+    Yf = np.array(Y, dtype=dtype)
+
+    tol = 8 * dpt.finfo(Xf.dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt.proj(Xf)), Yf, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_projection(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    Xf = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    out_dtype = dpt.proj(Xf).dtype
+    Yf = np.array(complex(1, 0), dtype=out_dtype)
+
+    tol = 8 * dpt.finfo(Yf.dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt.proj(Xf)), Yf, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize(
+    "np_call, dpt_call",
+    [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)],
+)
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_complex_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -1000.0
+    high = 1000.0
+    for ii in sizes:
+        x1 = np.random.uniform(low=low, high=high, size=ii)
+        x2 = np.random.uniform(low=low, high=high, size=ii)
+        Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_complex_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, -np.nan, np.inf, -np.inf, +0.0, -0.0]
+    xc = [complex(*val) for val in itertools.product(x, repeat=2)]
+
+    Xc_np = np.array(xc, dtype=dtype)
+    Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    actual = dpt.real(Xc)
+    expected = np.real(Xc_np)
+    assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol)
+
+    actual = dpt.imag(Xc)
+    expected = np.imag(Xc_np)
+    assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol)
+
+    actual = dpt.conj(Xc)
+    expected = np.conj(Xc_np)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_copysign.py b/dpnp/tests/tensor/elementwise/test_copysign.py
new file mode 100644
index 000000000000..f9ec5345d257
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_copysign.py
@@ -0,0 +1,130 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _compare_dtypes,
+    _no_complex_dtypes,
+    _real_fp_dtypes,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes)
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes)
+def test_copysign_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.copysign(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.copysign(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.copysign(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.copysign(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _real_fp_dtypes)
+def test_copysign_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.copysign(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.copysign(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dt", _real_fp_dtypes)
+def test_copysign(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.arange(100, dtype=dt, sycl_queue=q)
+    x[1::2] *= -1
+    y = dpt.ones(100, dtype=dt, sycl_queue=q)
+    y[::2] *= -1
+    res = dpt.copysign(x, y)
+    expected = dpt.negative(x)
+    tol = dpt.finfo(dt).resolution
+    assert dpt.allclose(res, expected, atol=tol, rtol=tol)
+
+
+def test_copysign_special_values():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([1.0, 0.0, dpt.nan, dpt.nan], dtype="f4")
+    y1 = dpt.asarray([-1.0, -0.0, -dpt.nan, -1], dtype="f4")
+    res = dpt.copysign(x1, y1)
+    assert dpt.all(dpt.signbit(res))
+    x2 = dpt.asarray([-1.0, -0.0, -dpt.nan, -dpt.nan], dtype="f4")
+    res = dpt.copysign(x2, y1)
+    assert dpt.all(dpt.signbit(res))
+    y2 = dpt.asarray([0.0, 1.0, dpt.nan, 1.0], dtype="f4")
+    res = dpt.copysign(x2, y2)
+    assert not dpt.any(dpt.signbit(res))
+    res = dpt.copysign(x1, y2)
+    assert not dpt.any(dpt.signbit(res))
diff --git a/dpnp/tests/tensor/elementwise/test_divide.py b/dpnp/tests/tensor/elementwise/test_divide.py
new file mode 100644
index 000000000000..e39436394f7d
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_divide.py
@@ -0,0 +1,313 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+from dpnp.tensor._tensor_elementwise_impl import _divide_by_scalar
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _complex_fp_dtypes,
+    _real_fp_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_divide_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.divide(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.divide(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.divide(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.divide(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_divide_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.divide(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_divide_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.divide(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.divide(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.divide(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.divide(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.divide(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.divide(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.divide(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.divide(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.divide(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.divide(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_divide_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(1, 6, dtype="i4")
+
+    r = dpt.divide(m, v)
+
+    expected = np.divide(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.divide(v, m)
+    expected2 = np.divide(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_divide_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        complex(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.divide(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.divide(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_divide_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.divide(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_divide_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.divide(a, c)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes + _complex_fp_dtypes)
+def test_divide_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind == "f":
+        X /= float(1)
+    elif dt_kind == "c":
+        X /= complex(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    # out array only valid if it is inexact
+    if (
+        _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind")
+        and dpt.dtype(op1_dtype).kind in "fc"
+    ):
+        ar1 /= ar2
+        assert dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 /= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(ValueError):
+            ar1 /= ar2
+            dpt.divide(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if (
+        _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64)
+        and dpt.dtype(op2_dtype).kind in "fc"
+    ):
+        dpt.divide(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.divide(ar3, ar4, out=ar4)
+        dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(ValueError):
+            dpt.divide(ar1, ar2, out=ar2)
+
+
+def test_divide_gh_1711():
+    "See https://github.com/IntelPython/dpctl/issues/1711"
+    get_queue_or_skip()
+
+    res = dpt.divide(-4, dpt.asarray(1, dtype="u4"))
+    assert isinstance(res, dpt.usm_ndarray)
+    assert res.dtype.kind == "f"
+    assert dpt.allclose(res, -4 / dpt.asarray(1, dtype="i4"))
+
+    res = dpt.divide(dpt.asarray(3, dtype="u4"), -2)
+    assert isinstance(res, dpt.usm_ndarray)
+    assert res.dtype.kind == "f"
+    assert dpt.allclose(res, dpt.asarray(3, dtype="i4") / -2)
+
+
+# don't test for overflowing double as Python won't cast
+# a Python integer of that size to a Python float
+@pytest.mark.parametrize("fp_dt", [dpt.float16, dpt.float32])
+def test_divide_by_scalar_overflow(fp_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(fp_dt, q)
+
+    x = dpt.ones(10, dtype=fp_dt, sycl_queue=q)
+    out = dpt.empty_like(x)
+
+    max_exp = np.finfo(fp_dt).maxexp
+    sca = 2**max_exp
+
+    _manager = SequentialOrderManager[q]
+    dep_evs = _manager.submitted_events
+    _, ev = _divide_by_scalar(
+        src=x, scalar=sca, dst=out, sycl_queue=q, depends=dep_evs
+    )
+    ev.wait()
+
+    assert dpt.all(out == 0)
diff --git a/dpnp/tests/tensor/elementwise/test_elementwise_classes.py b/dpnp/tests/tensor/elementwise/test_elementwise_classes.py
new file mode 100644
index 000000000000..04b92937f371
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_elementwise_classes.py
@@ -0,0 +1,150 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import get_queue_or_skip
+
+unary_fn = dpt.negative
+binary_fn = dpt.divide
+
+
+def test_unary_class_getters():
+    fn = unary_fn.get_implementation_function()
+    assert callable(fn)
+
+    fn = unary_fn.get_type_result_resolver_function()
+    assert callable(fn)
+
+
+def test_unary_class_types_property():
+    get_queue_or_skip()
+    loop_types = unary_fn.types
+    assert isinstance(loop_types, list)
+    assert len(loop_types) > 0
+    assert all(isinstance(sig, str) for sig in loop_types)
+    assert all("->" in sig for sig in loop_types)
+
+
+def test_unary_class_str_repr():
+    s = str(unary_fn)
+    r = repr(unary_fn)
+
+    assert isinstance(s, str)
+    assert isinstance(r, str)
+    kl_n = unary_fn.__name__
+    assert kl_n in s
+    assert kl_n in r
+
+
+def test_unary_read_only_out():
+    get_queue_or_skip()
+    x = dpt.arange(32, dtype=dpt.int32)
+    r = dpt.empty_like(x)
+    r.flags["W"] = False
+    with pytest.raises(ValueError):
+        unary_fn(x, out=r)
+
+
+def test_binary_class_getters():
+    fn = binary_fn.get_implementation_function()
+    assert callable(fn)
+
+    fn = binary_fn.get_implementation_inplace_function()
+    assert callable(fn)
+
+    fn = binary_fn.get_type_result_resolver_function()
+    assert callable(fn)
+
+    fn = binary_fn.get_type_promotion_path_acceptance_function()
+    assert callable(fn)
+
+
+def test_binary_class_types_property():
+    get_queue_or_skip()
+    loop_types = binary_fn.types
+    assert isinstance(loop_types, list)
+    assert len(loop_types) > 0
+    assert all(isinstance(sig, str) for sig in loop_types)
+    assert all("->" in sig for sig in loop_types)
+
+
+def test_binary_class_str_repr():
+    s = str(binary_fn)
+    r = repr(binary_fn)
+
+    assert isinstance(s, str)
+    assert isinstance(r, str)
+    kl_n = binary_fn.__name__
+    assert kl_n in s
+    assert kl_n in r
+
+
+def test_unary_class_nin():
+    nin = unary_fn.nin
+    assert isinstance(nin, int)
+    assert nin == 1
+
+
+def test_binary_class_nin():
+    nin = binary_fn.nin
+    assert isinstance(nin, int)
+    assert nin == 2
+
+
+def test_unary_class_nout():
+    nout = unary_fn.nout
+    assert isinstance(nout, int)
+    assert nout == 1
+
+
+def test_binary_class_nout():
+    nout = binary_fn.nout
+    assert isinstance(nout, int)
+    assert nout == 1
+
+
+def test_binary_read_only_out():
+    get_queue_or_skip()
+    x1 = dpt.ones(32, dtype=dpt.float32)
+    x2 = dpt.ones_like(x1)
+    r = dpt.empty_like(x1)
+    r.flags["W"] = False
+    with pytest.raises(ValueError):
+        binary_fn(x1, x2, out=r)
+
+
+def test_binary_no_inplace_op():
+    get_queue_or_skip()
+    x1 = dpt.ones(10, dtype="i4")
+    x2 = dpt.ones_like(x1)
+
+    with pytest.raises(ValueError):
+        dpt.logaddexp._inplace_op(x1, x2)
diff --git a/dpnp/tests/tensor/elementwise/test_equal.py b/dpnp/tests/tensor/elementwise/test_equal.py
new file mode 100644
index 000000000000..2791d600f7a3
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_equal.py
@@ -0,0 +1,209 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_equal_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.equal(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_dtype = np.equal(
+        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
+    ).dtype
+    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == np.full(r.shape, True, dtype=r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.equal(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_dtype = np.equal(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    ).dtype
+    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == np.full(r.shape, True, dtype=r.dtype)).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_equal_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.equal(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_equal_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.equal(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.equal(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.equal(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.equal(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.equal(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.equal(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.equal(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.equal(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.equal(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.equal(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_equal_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    r = dpt.equal(m, v)
+    expected = np.full((100, 5), [False, True, False, False, False], dtype="?")
+
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r2 = dpt.equal(v, m)
+    assert (dpt.asnumpy(r2) == expected).all()
+
+    r3 = dpt.empty_like(m, dtype="?")
+    dpt.equal(m, v, out=r3)
+    assert (dpt.asnumpy(r3) == expected).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_equal_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_zeros = (
+        bool(0),
+        int(0),
+        float(0),
+        complex(0),
+        np.float32(0),
+        ctypes.c_int(0),
+    )
+    for sc in py_zeros:
+        R = dpt.equal(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        assert dpt.all(R)
+        R = dpt.equal(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+        assert dpt.all(R)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_equal_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.equal(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_equal_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.equal(a, c)
diff --git a/dpnp/tests/tensor/elementwise/test_exp.py b/dpnp/tests/tensor/elementwise/test_exp.py
new file mode 100644
index 000000000000..d123ed0c83a8
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_exp.py
@@ -0,0 +1,253 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose, assert_array_equal
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_exp_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.exp(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.exp(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_exp_real_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    Xnp = np.linspace(0.01, 88.1, num=n_seq, dtype=dtype)
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt.exp(X)
+    with np.errstate(all="ignore"):
+        Ynp = np.exp(Xnp)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(Y), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt.exp(X, out=Z)
+
+    assert_allclose(dpt.asnumpy(Z), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_exp_complex_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    low = -88.0
+    high = 88.0
+    x1 = np.random.uniform(low=low, high=high, size=n_seq)
+    x2 = np.random.uniform(low=low, high=high, size=n_seq)
+    Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt.exp(X)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(Y), np.repeat(np.exp(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt.exp(X, out=Z)
+
+    assert_allclose(
+        dpt.asnumpy(Z), np.repeat(np.exp(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_exp_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 16.0
+    X[..., 1::2] = 23.0
+
+    Y = dpt.exp(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.exp(np.float32(16.0))
+    expected_Y[..., 1::2] = np.exp(np.float32(23.0))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_exp_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 8.0
+    X[..., 1::2] = 11.0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.exp(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.exp(U, order=ord)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_exp_analytical_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    log2_ = 0.69314718055994530943
+    Xnp = np.array(x, dtype=dtype) * log2_
+    X = dpt.asarray(Xnp, dtype=dtype)
+    assert_allclose(dpt.asnumpy(dpt.exp(X)), np.exp(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_exp_real_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    x = [np.nan, np.inf, -np.inf, 0.0, -0.0]
+    Xnp = np.array(x, dtype=dtype)
+    X = dpt.asarray(x, dtype=dtype)
+
+    Y = dpt.asnumpy(dpt.exp(X))
+    Ynp = np.exp(Xnp)
+    assert_allclose(Y, Ynp, atol=tol, rtol=tol)
+    assert_array_equal(np.signbit(Y), np.signbit(Ynp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_exp_real_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=0.01, high=88.1, size=ii)
+        Xnp.astype(dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np.exp(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt.exp(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_exp_complex_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -88.0
+    high = 88.0
+    for ii in sizes:
+        x1 = np.random.uniform(low=low, high=high, size=ii)
+        x2 = np.random.uniform(low=low, high=high, size=ii)
+        Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np.exp(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt.exp(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_exp_complex_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, +0.0, -0.0, +1.0, -1.0]
+    xc = [complex(*val) for val in itertools.product(x, repeat=2)]
+
+    Xc_np = np.array(xc, dtype=dtype)
+    Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q)
+
+    with np.errstate(all="ignore"):
+        Ynp = np.exp(Xc_np)
+    Y = dpt.exp(Xc)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt.real(Y)), np.real(Ynp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(dpt.imag(Y)), np.imag(Ynp), atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_exp2.py b/dpnp/tests/tensor/elementwise/test_exp2.py
new file mode 100644
index 000000000000..ae2ab43c39be
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_exp2.py
@@ -0,0 +1,187 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_exp2_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.exp2(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.exp2(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_exp2_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.exp2(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_exp2_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2 * 1027
+
+    X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.exp2(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_exp2_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1 / 4
+    X[..., 1::2] = 1 / 2
+
+    Y = dpt.exp2(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.exp2(np.float32(1 / 4))
+    expected_Y[..., 1::2] = np.exp2(np.float32(1 / 2))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_exp2_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 1 / 4
+    X[..., 1::2] = 1 / 2
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.exp2(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.exp2(U, order=ord)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+def test_exp2_special_cases():
+    get_queue_or_skip()
+
+    X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = np.asarray([np.nan, 1.0, 1.0, np.inf, 0.0], dtype="f4")
+
+    tol = dpt.finfo(X.dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol)
+
+    # special cases for complex variant
+    num_finite = 1.0
+    vals = [
+        complex(0.0, 0.0),
+        complex(num_finite, dpt.inf),
+        complex(num_finite, dpt.nan),
+        complex(dpt.inf, 0.0),
+        complex(-dpt.inf, num_finite),
+        complex(dpt.inf, num_finite),
+        complex(-dpt.inf, dpt.inf),
+        complex(dpt.inf, dpt.inf),
+        complex(-dpt.inf, dpt.nan),
+        complex(dpt.inf, dpt.nan),
+        complex(dpt.nan, 0.0),
+        complex(dpt.nan, num_finite),
+        complex(dpt.nan, dpt.nan),
+    ]
+    X = dpt.asarray(vals, dtype=dpt.complex64)
+    cis_1 = complex(np.cos(num_finite), np.sin(num_finite))
+    c_nan = complex(np.nan, np.nan)
+    res = np.asarray(
+        [
+            complex(1.0, 0.0),
+            c_nan,
+            c_nan,
+            complex(np.inf, 0.0),
+            0.0,
+            np.inf * cis_1,
+            complex(0.0, 0.0),
+            complex(np.inf, np.nan),
+            complex(0.0, 0.0),
+            complex(np.inf, np.nan),
+            complex(np.nan, 0.0),
+            c_nan,
+            c_nan,
+        ],
+        dtype=np.complex64,
+    )
+
+    tol = dpt.finfo(X.dtype).resolution
+    with np.errstate(invalid="ignore"):
+        assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_expm1.py b/dpnp/tests/tensor/elementwise/test_expm1.py
new file mode 100644
index 000000000000..bb665c424564
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_expm1.py
@@ -0,0 +1,187 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_expm1_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.expm1(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.expm1(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_expm1_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(-2, 2, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.expm1(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.expm1(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_expm1_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2 * 1027
+
+    X = dpt.linspace(-2, 2, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.expm1(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.expm1(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_expm1_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1 / 50
+    X[..., 1::2] = 1 / 25
+
+    Y = dpt.expm1(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.expm1(np.float32(1 / 50))
+    expected_Y[..., 1::2] = np.expm1(np.float32(1 / 25))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_expm1_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 1 / 50
+    X[..., 1::2] = 1 / 25
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.expm1(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.expm1(U, order=ord)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+def test_expm1_special_cases():
+    get_queue_or_skip()
+
+    X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = np.asarray([np.nan, 0.0, -0.0, np.inf, -1.0], dtype="f4")
+
+    tol = dpt.finfo(X.dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt.expm1(X)), res, atol=tol, rtol=tol)
+
+    # special cases for complex variant
+    num_finite = 1.0
+    vals = [
+        complex(0.0, 0.0),
+        complex(num_finite, dpt.inf),
+        complex(num_finite, dpt.nan),
+        complex(dpt.inf, 0.0),
+        complex(-dpt.inf, num_finite),
+        complex(dpt.inf, num_finite),
+        complex(-dpt.inf, dpt.inf),
+        complex(dpt.inf, dpt.inf),
+        complex(-dpt.inf, dpt.nan),
+        complex(dpt.inf, dpt.nan),
+        complex(dpt.nan, 0.0),
+        complex(dpt.nan, num_finite),
+        complex(dpt.nan, dpt.nan),
+    ]
+    X = dpt.asarray(vals, dtype=dpt.complex64)
+    cis_1 = complex(np.cos(num_finite), np.sin(num_finite))
+    c_nan = complex(np.nan, np.nan)
+    res = np.asarray(
+        [
+            complex(0.0, 0.0),
+            c_nan,
+            c_nan,
+            complex(np.inf, 0.0),
+            0.0 * cis_1 - 1.0,
+            np.inf * cis_1 - 1.0,
+            complex(-1.0, 0.0),
+            complex(np.inf, np.nan),
+            complex(-1.0, 0.0),
+            complex(np.inf, np.nan),
+            complex(np.nan, 0.0),
+            c_nan,
+            c_nan,
+        ],
+        dtype=np.complex64,
+    )
+
+    tol = dpt.finfo(X.dtype).resolution
+    with np.errstate(invalid="ignore"):
+        assert_allclose(dpt.asnumpy(dpt.expm1(X)), res, atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_floor_ceil_trunc.py b/dpnp/tests/tensor/elementwise/test_floor_ceil_trunc.py
new file mode 100644
index 000000000000..f9af864b29fe
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_floor_ceil_trunc.py
@@ -0,0 +1,182 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+import re
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose, assert_array_equal
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _map_to_device_dtype,
+    _no_complex_dtypes,
+    _real_value_dtypes,
+)
+
+_all_funcs = [(np.floor, dpt.floor), (np.ceil, dpt.ceil), (np.trunc, dpt.trunc)]
+
+
+@pytest.mark.parametrize("dpt_call", [dpt.floor, dpt.ceil, dpt.trunc])
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_floor_ceil_trunc_out_type(dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    X = dpt.asarray(0.1, dtype=arg_dt, sycl_queue=q)
+    expected_dtype = _map_to_device_dtype(arg_dt, q.sycl_device)
+    assert dpt_call(X).dtype == expected_dtype
+
+    X = dpt.asarray(0.1, dtype=dtype, sycl_queue=q)
+    expected_dtype = _map_to_device_dtype(arg_dt, q.sycl_device)
+    Y = dpt.empty_like(X, dtype=expected_dtype)
+    dpt_call(X, out=Y)
+    assert_allclose(dpt.asnumpy(dpt_call(X)), dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
+def test_floor_ceil_trunc_usm_type(np_call, dpt_call, usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = -0.4
+    X[..., 1::2] = 0.7
+
+    Y = dpt_call(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np_call(dpt.asnumpy(X))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_floor_ceil_trunc_order(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (4, 4, 4, 4)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = -0.4
+    X[..., 1::2] = 0.7
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np_call(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt_call(U, order=ord)
+            assert_allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dpt_call", [dpt.floor, dpt.ceil, dpt.trunc])
+@pytest.mark.parametrize("dtype", _real_value_dtypes)
+def test_floor_ceil_trunc_error_dtype(dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.zeros(5, dtype=dtype)
+    y = dpt.empty_like(x, dtype="b1")
+    with pytest.raises(ValueError) as excinfo:
+        dpt_call(x, out=y)
+    assert re.match("Output array of type.*is needed", str(excinfo.value))
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_floor_ceil_trunc_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    Xnp = np.linspace(-99.9, 99.9, num=n_seq, dtype=dtype)
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt_call(X)
+
+    assert_allclose(dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep))
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(dpt.asnumpy(Z), np.repeat(np_call(Xnp), n_rep))
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_floor_ceil_trunc_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 24, 32, 72]
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=-99.9, high=99.9, size=ii)
+        Xnp.astype(dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+            )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_floor_ceil_trunc_special_cases(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, +0.0, -0.0]
+
+    xf = np.array(x, dtype=dtype)
+    yf = dpt.asarray(xf, dtype=dtype, sycl_queue=q)
+
+    Y_np = np_call(xf)
+    Y = dpt.asnumpy(dpt_call(yf))
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(Y, Y_np, atol=tol, rtol=tol)
+    assert_array_equal(np.signbit(Y), np.signbit(Y_np))
diff --git a/dpnp/tests/tensor/elementwise/test_floor_divide.py b/dpnp/tests/tensor/elementwise/test_floor_divide.py
new file mode 100644
index 000000000000..6a18575722b5
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_floor_divide.py
@@ -0,0 +1,319 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _compare_dtypes,
+    _integral_dtypes,
+    _no_complex_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
+def test_floor_divide_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.floor_divide(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.floor_divide(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.floor_divide(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.floor_divide(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_floor_divide_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.floor_divide(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_floor_divide_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.floor_divide(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.floor_divide(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.floor_divide(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.floor_divide(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.floor_divide(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.floor_divide(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.floor_divide(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.floor_divide(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.floor_divide(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.floor_divide(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_floor_divide_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(1, 6, dtype="i4")
+
+    r = dpt.floor_divide(m, v)
+
+    expected = np.floor_divide(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.floor_divide(v, m)
+    expected2 = np.floor_divide(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:])
+def test_floor_divide_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.floor_divide(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.floor_divide(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_floor_divide_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.floor_divide(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_floor_divide_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.floor_divide(a, c)
+
+
+def test_floor_divide_gh_1247():
+    get_queue_or_skip()
+
+    x = dpt.ones(1, dtype="i4")
+    res = dpt.floor_divide(x, -2)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.full(res.shape, -1, dtype=res.dtype)
+    )
+
+    x = dpt.full(1, -1, dtype="i4")
+    res = dpt.floor_divide(x, 2)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.full(res.shape, -1, dtype=res.dtype)
+    )
+
+
+@pytest.mark.parametrize("dtype", _integral_dtypes)
+def test_floor_divide_integer_zero(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.arange(10, dtype=dtype, sycl_queue=q)
+    y = dpt.zeros_like(x, sycl_queue=q)
+    res = dpt.floor_divide(x, y)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.zeros(x.shape, dtype=res.dtype)
+    )
+
+
+def test_floor_divide_special_cases():
+    q = get_queue_or_skip()
+
+    x = dpt.empty(1, dtype="f4", sycl_queue=q)
+    y = dpt.empty_like(x)
+    x[0], y[0] = dpt.inf, dpt.inf
+    res = dpt.floor_divide(x, y)
+    with np.errstate(all="ignore"):
+        res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y))
+        np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+    x[0], y[0] = 0.0, -1.0
+    res = dpt.floor_divide(x, y)
+    x_np = dpt.asnumpy(x)
+    y_np = dpt.asnumpy(y)
+    res_np = np.floor_divide(x_np, y_np)
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+    res = dpt.floor_divide(y, x)
+    with np.errstate(all="ignore"):
+        res_np = np.floor_divide(y_np, x_np)
+        np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+    x[0], y[0] = -1.0, dpt.inf
+    res = dpt.floor_divide(x, y)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.asarray([-0.0], dtype="f4")
+    )
+
+    res = dpt.floor_divide(y, x)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.asarray([-dpt.inf], dtype="f4")
+    )
+
+    x[0], y[0] = 1.0, dpt.nan
+    res = dpt.floor_divide(x, y)
+    res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y))
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
+def test_divide_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X //= int(1)
+    elif dt_kind == "f":
+        X //= float(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
+def test_floor_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    # out array only valid if it is inexact
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 //= ar2
+        assert dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 //= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(ValueError):
+            ar1 //= ar2
+            dpt.floor_divide(ar1, ar2, out=ar1)
diff --git a/dpnp/tests/tensor/elementwise/test_greater.py b/dpnp/tests/tensor/elementwise/test_greater.py
new file mode 100644
index 000000000000..7234bd03d86a
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_greater.py
@@ -0,0 +1,316 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_greater_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.zeros(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.greater(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.greater(
+        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.zeros(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.greater(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.greater(
+        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
+def test_greater_complex_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 127
+    ar1_np_real = np.random.randint(0, 10, sz)
+    ar1_np_imag = np.random.randint(0, 10, sz)
+    ar1_np = ar1_np_real + 1j * ar1_np_imag
+    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
+
+    ar2_np_real = np.random.randint(0, 10, sz)
+    ar2_np_imag = np.random.randint(0, 10, sz)
+    ar2_np = ar2_np_real + 1j * ar2_np_imag
+    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
+
+    r = dpt.greater(ar1, ar2)
+    expected = np.greater(ar1_np, ar2_np)
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == expected.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.greater(ar1[::-2], ar2[::2])
+    expected1 = np.greater(ar1_np[::-2], ar2_np[::2])
+    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
+    assert r1.shape == expected1.shape
+    assert (dpt.asnumpy(r1) == expected1).all()
+
+    ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype)
+    ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype)
+
+    ar3_np = dpt.asnumpy(ar3)
+    ar4_np = dpt.asnumpy(ar4)
+
+    r2 = dpt.greater(ar3, ar4)
+    with np.errstate(invalid="ignore"):
+        expected2 = np.greater(ar3_np, ar4_np)
+    assert (dpt.asnumpy(r2) == expected2).all()
+
+    r3 = dpt.greater(ar4, ar4)
+    with np.errstate(invalid="ignore"):
+        expected3 = np.greater(ar4_np, ar4_np)
+    assert (dpt.asnumpy(r3) == expected3).all()
+
+
+def test_greater_complex_float():
+    get_queue_or_skip()
+
+    ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8")
+    ar2 = dpt.full((4,), 2, dtype="f4")
+
+    ar1_np = dpt.asnumpy(ar1)
+    ar2_np = dpt.asnumpy(ar2)
+
+    r = dpt.greater(ar1, ar2)
+    expected = np.greater(ar1_np, ar2_np)
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.greater(ar2, ar1)
+    expected1 = np.greater(ar2_np, ar1_np)
+    assert (dpt.asnumpy(r1) == expected1).all()
+    with np.errstate(invalid="ignore"):
+        for tp in [dpt.nan, dpt.inf, -dpt.inf]:
+
+            ar3 = dpt.full((4,), tp)
+            ar3_np = dpt.asnumpy(ar3)
+
+            r2 = dpt.greater(ar1, ar3)
+            expected2 = np.greater(ar1_np, ar3_np)
+            assert (dpt.asnumpy(r2) == expected2).all()
+
+            r3 = dpt.greater(ar3, ar1)
+            expected3 = np.greater(ar3_np, ar1_np)
+            assert (dpt.asnumpy(r3) == expected3).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_greater_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.greater(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_greater_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.greater(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.greater(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.greater(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.greater(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.greater(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.greater(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.greater(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.greater(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.greater(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.greater(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_greater_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(1, 6, dtype="i4")
+
+    r = dpt.greater(m, v)
+
+    expected = np.greater(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.greater(v, m)
+    expected2 = np.greater(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_greater_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        complex(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.greater(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.greater(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_greater_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.greater(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_greater_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.greater(a, c)
+
+
+def test_greater_mixed_integer_kinds():
+    get_queue_or_skip()
+
+    x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8"))
+    x2 = dpt.arange(10, dtype="u8")
+
+    # u8 - i8
+    res = dpt.greater(x2, x1)
+    assert dpt.all(res[1:])
+    assert not res[0]
+    # i8 - u8
+    assert not dpt.any(dpt.greater(x1, x2))
+
+    # Python scalar
+    assert dpt.all(dpt.greater(x2, -1))
+    assert not dpt.any(dpt.greater(-1, x2))
+
+
+def test_greater_very_large_py_int():
+    get_queue_or_skip()
+
+    py_int = dpt.iinfo(dpt.int64).max + 10
+
+    x = dpt.asarray(3, dtype="u8")
+    assert py_int > x
+    assert not dpt.greater(x, py_int)
+
+    x = dpt.asarray(py_int, dtype="u8")
+    assert x > -1
+    assert not dpt.greater(-1, x)
diff --git a/dpnp/tests/tensor/elementwise/test_greater_equal.py b/dpnp/tests/tensor/elementwise/test_greater_equal.py
new file mode 100644
index 000000000000..888dfbd342b7
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_greater_equal.py
@@ -0,0 +1,315 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_greater_equal_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.zeros(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.greater_equal(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.greater_equal(
+        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.zeros(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.greater_equal(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.greater_equal(
+        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
+def test_greater_equal_complex_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 127
+    ar1_np_real = np.random.randint(0, 10, sz)
+    ar1_np_imag = np.random.randint(0, 10, sz)
+    ar1_np = ar1_np_real + 1j * ar1_np_imag
+    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
+
+    ar2_np_real = np.random.randint(0, 10, sz)
+    ar2_np_imag = np.random.randint(0, 10, sz)
+    ar2_np = ar2_np_real + 1j * ar2_np_imag
+    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
+
+    r = dpt.greater_equal(ar1, ar2)
+    expected = np.greater_equal(ar1_np, ar2_np)
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == expected.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.greater_equal(ar1[::-2], ar2[::2])
+    expected1 = np.greater_equal(ar1_np[::-2], ar2_np[::2])
+    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
+    assert r1.shape == expected1.shape
+    assert (dpt.asnumpy(r1) == expected1).all()
+
+    ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype)
+    ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype)
+
+    ar3_np = dpt.asnumpy(ar3)
+    ar4_np = dpt.asnumpy(ar4)
+    r2 = dpt.greater_equal(ar3, ar4)
+    with np.errstate(invalid="ignore"):
+        expected2 = np.greater_equal(ar3_np, ar4_np)
+    assert (dpt.asnumpy(r2) == expected2).all()
+
+    r3 = dpt.greater_equal(ar4, ar4)
+    with np.errstate(invalid="ignore"):
+        expected3 = np.greater_equal(ar4_np, ar4_np)
+    assert (dpt.asnumpy(r3) == expected3).all()
+
+
+def test_greater_equal_complex_float():
+    get_queue_or_skip()
+
+    ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8")
+    ar2 = dpt.full((4,), 2, dtype="f4")
+
+    ar1_np = dpt.asnumpy(ar1)
+    ar2_np = dpt.asnumpy(ar2)
+
+    r = dpt.greater_equal(ar1, ar2)
+    expected = np.greater_equal(ar1_np, ar2_np)
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.greater_equal(ar2, ar1)
+    expected1 = np.greater_equal(ar2_np, ar1_np)
+    assert (dpt.asnumpy(r1) == expected1).all()
+    with np.errstate(invalid="ignore"):
+        for tp in [dpt.nan, dpt.inf, -dpt.inf]:
+
+            ar3 = dpt.full((4,), tp)
+            ar3_np = dpt.asnumpy(ar3)
+            r2 = dpt.greater_equal(ar1, ar3)
+            expected2 = np.greater_equal(ar1_np, ar3_np)
+            assert (dpt.asnumpy(r2) == expected2).all()
+
+            r3 = dpt.greater_equal(ar3, ar1)
+            expected3 = np.greater_equal(ar3_np, ar1_np)
+            assert (dpt.asnumpy(r3) == expected3).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_greater_equal_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.greater_equal(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_greater_equal_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.greater_equal(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.greater_equal(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.greater_equal(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.greater_equal(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.greater_equal(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.greater_equal(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.greater_equal(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.greater_equal(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.greater_equal(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.greater_equal(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_greater_equal_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(1, 6, dtype="i4")
+
+    r = dpt.greater_equal(m, v)
+
+    expected = np.greater_equal(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.greater_equal(v, m)
+    expected2 = np.greater_equal(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_greater_equal_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        complex(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.greater_equal(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.greater_equal(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_greater_equal_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.greater_equal(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_greater_equal_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.greater_equal(a, c)
+
+
+def test_greater_equal_mixed_integer_kinds():
+    get_queue_or_skip()
+
+    x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8"))
+    x2 = dpt.arange(10, dtype="u8")
+
+    # u8 - i8
+    res = dpt.greater_equal(x2, x1)
+    assert dpt.all(res)
+    # i8 - u8
+    res = dpt.greater_equal(x1, x2)
+    assert not dpt.any(res[1:])
+    assert res[0]
+
+    # Python scalar
+    assert dpt.all(dpt.greater_equal(x2, -1))
+    assert not dpt.any(dpt.greater_equal(-1, x2))
+
+
+def test_greater_equal_very_large_py_int():
+    get_queue_or_skip()
+
+    py_int = dpt.iinfo(dpt.int64).max + 10
+
+    x = dpt.asarray(3, dtype="u8")
+    assert py_int >= x
+    assert not dpt.greater_equal(x, py_int)
+
+    x = dpt.asarray(py_int, dtype="u8")
+    assert x >= -1
+    assert not dpt.greater_equal(-1, x)
diff --git a/dpnp/tests/tensor/elementwise/test_hyperbolic.py b/dpnp/tests/tensor/elementwise/test_hyperbolic.py
new file mode 100644
index 000000000000..b94c5ede3f2a
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_hyperbolic.py
@@ -0,0 +1,202 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+)
+
+_hyper_funcs = [(np.sinh, dpt.sinh), (np.cosh, dpt.cosh), (np.tanh, dpt.tanh)]
+_inv_hyper_funcs = [
+    (np.arcsinh, dpt.asinh),
+    (np.arccosh, dpt.acosh),
+    (np.arctanh, dpt.atanh),
+]
+_all_funcs = _hyper_funcs + _inv_hyper_funcs
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_hyper_out_type(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    a = 1 if np_call == np.arccosh else 0
+
+    x = dpt.asarray(a, dtype=dtype, sycl_queue=q)
+    expected_dtype = np_call(np.array(a, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt_call(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_hyper_real_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    if np_call == np.arctanh:
+        Xnp = np.linspace(-0.9, 0.9, num=n_seq, dtype=dtype)
+    elif np_call == np.arccosh:
+        Xnp = np.linspace(1.01, 10.0, num=n_seq, dtype=dtype)
+    else:
+        Xnp = np.linspace(-10.0, 10.0, num=n_seq, dtype=dtype)
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt_call(X)
+
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(
+        dpt.asnumpy(Z), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_hyper_complex_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    low = -9.0
+    high = 9.0
+    x1 = np.random.uniform(low=low, high=high, size=n_seq)
+    x2 = np.random.uniform(low=low, high=high, size=n_seq)
+    Xnp = x1 + 1j * x2
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt_call(X)
+
+    expected = np.repeat(np_call(Xnp), n_rep).astype(dtype)
+    tol = 50 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(Y), expected, atol=tol, rtol=tol)
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(dpt.asnumpy(Z), expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_hyper_real_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -10.0
+    high = 10.0
+    if np_call == np.arctanh:
+        low = -0.9
+        high = 0.9
+    elif np_call == np.arccosh:
+        low = 1.01
+        high = 100.0
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=low, high=high, size=ii)
+        Xnp.astype(dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_hyper_complex_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 50 * dpt.finfo(dtype).resolution
+
+    low = -8.0
+    high = 8.0
+    for ii in sizes:
+        x1 = np.random.uniform(low=low, high=high, size=ii)
+        x2 = np.random.uniform(low=low, high=high, size=ii)
+        Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_hyper_real_special_cases(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, 2.0, -2.0, +0.0, -0.0, +1.0, -1.0]
+
+    xf = np.array(x, dtype=dtype)
+    yf = dpt.asarray(xf, dtype=dtype, sycl_queue=q)
+
+    with np.errstate(all="ignore"):
+        Y_np = np_call(xf)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt_call(yf)), Y_np, atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_hypot.py b/dpnp/tests/tensor/elementwise/test_hypot.py
new file mode 100644
index 000000000000..7cebaf3bf6ab
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_hypot.py
@@ -0,0 +1,212 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _compare_dtypes,
+    _no_complex_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
+def test_hypot_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.zeros(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.zeros_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    r = dpt.hypot(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.hypot(
+        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.zeros(sz, dtype=op1_dtype, sycl_queue=q)
+    ar4 = dpt.zeros(2 * sz, dtype=op2_dtype, sycl_queue=q)
+
+    r = dpt.hypot(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.hypot(
+        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_hypot_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.hypot(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_hypot_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.hypot(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.hypot(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.hypot(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.hypot(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.hypot(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.hypot(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.hypot(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.hypot(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.hypot(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.hypot(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_hypot_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(1, 6, dtype="i4")
+
+    r = dpt.hypot(m, v)
+
+    expected = np.hypot(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    tol = 8 * np.finfo(r.dtype).resolution
+    assert np.allclose(
+        dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol
+    )
+
+    r2 = dpt.hypot(v, m)
+    expected2 = np.hypot(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert np.allclose(
+        dpt.asnumpy(r2), expected2.astype(r2.dtype), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:])
+def test_hypot_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.hypot(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.hypot(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_hypot_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.hypot(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_hypot_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.hypot(a, c)
diff --git a/dpnp/tests/tensor/elementwise/test_isfinite.py b/dpnp/tests/tensor/elementwise/test_isfinite.py
new file mode 100644
index 000000000000..f3a6664e6916
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_isfinite.py
@@ -0,0 +1,114 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_isfinite_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    assert dpt.isfinite(X).dtype == dpt.bool
+
+
+def test_isfinite_output():
+    q = get_queue_or_skip()
+
+    Xnp = np.asarray(np.nan)
+    X = dpt.asarray(np.nan, sycl_queue=q)
+    assert dpt.asnumpy(dpt.isfinite(X)) == np.isfinite(Xnp)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_isfinite_complex(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    y1 = complex(np.nan, np.nan)
+    y2 = complex(1, np.nan)
+    y3 = complex(np.nan, 1)
+    y4 = complex(2, 1)
+    y5 = complex(np.inf, 1)
+
+    Ynp = np.repeat(np.array([y1, y2, y3, y4, y5], dtype=dtype), 12)
+    Y = dpt.asarray(Ynp, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(dpt.isfinite(Y)), np.isfinite(Ynp))
+
+    r = dpt.empty_like(Y, dtype="bool")
+    dpt.isfinite(Y, out=r)
+    assert np.array_equal(dpt.asnumpy(r)[()], np.isfinite(Ynp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_isfinite_floats(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    y1 = np.nan
+    y2 = 1
+    y3 = np.inf
+
+    for mult in [123, 137, 255, 271, 272]:
+        Ynp = np.repeat(np.array([y1, y2, y3], dtype=dtype), mult)
+        Y = dpt.asarray(Ynp, sycl_queue=q)
+        assert np.array_equal(dpt.asnumpy(dpt.isfinite(Y)), np.isfinite(Ynp))
+
+        r = dpt.empty_like(Y, dtype="bool")
+        dpt.isfinite(Y, out=r)
+        assert np.array_equal(dpt.asnumpy(r)[()], np.isfinite(Ynp))
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_isfinite_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q)
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms)
+        expected_Y = np.full(U.shape, fill_value=True, dtype=dpt.bool)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.isfinite(U, order=ord)
+            assert_allclose(dpt.asnumpy(Y), expected_Y)
diff --git a/dpnp/tests/tensor/elementwise/test_isinf.py b/dpnp/tests/tensor/elementwise/test_isinf.py
new file mode 100644
index 000000000000..91b2e9420446
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_isinf.py
@@ -0,0 +1,108 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_isinf_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    assert dpt.isinf(X).dtype == dpt.bool
+
+
+def test_isinf_output():
+    q = get_queue_or_skip()
+
+    Xnp = np.asarray(np.inf)
+    X = dpt.asarray(np.inf, sycl_queue=q)
+    assert dpt.asnumpy(dpt.isinf(X)) == np.isinf(Xnp)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_isinf_complex(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    y1 = complex(np.inf, np.inf)
+    y2 = complex(1, np.inf)
+    y3 = complex(np.inf, 1)
+    y4 = complex(2, 1)
+    y5 = complex(np.inf, 1)
+    y6 = complex(np.inf, np.nan)
+
+    Ynp = np.repeat(np.array([y1, y2, y3, y4, y5, y6], dtype=dtype), 123)
+    Y = dpt.asarray(Ynp, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(dpt.isinf(Y)), np.isinf(Ynp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_isinf_floats(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    y1 = np.nan
+    y2 = 1
+    y3 = np.inf
+    y4 = -np.inf
+
+    for mult in [123, 137, 255, 271, 272]:
+        Ynp = np.repeat(np.array([y1, y2, y3, y4], dtype=dtype), mult)
+        Y = dpt.asarray(Ynp, sycl_queue=q)
+        assert np.array_equal(dpt.asnumpy(dpt.isinf(Y)), np.isinf(Ynp))
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_isinf_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q)
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms)
+        expected_Y = np.full(U.shape, fill_value=False, dtype=dpt.bool)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.isinf(U, order=ord)
+            assert_allclose(dpt.asnumpy(Y), expected_Y)
diff --git a/dpnp/tests/tensor/elementwise/test_isnan.py b/dpnp/tests/tensor/elementwise/test_isnan.py
new file mode 100644
index 000000000000..fe6f2660734a
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_isnan.py
@@ -0,0 +1,113 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_isnan_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    assert dpt.isnan(X).dtype == dpt.bool
+
+
+def test_isnan_output():
+    q = get_queue_or_skip()
+
+    Xnp = np.asarray(np.nan)
+    X = dpt.asarray(np.nan, sycl_queue=q)
+    assert dpt.asnumpy(dpt.isnan(X)) == np.isnan(Xnp)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_isnan_complex(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    y1 = complex(np.nan, np.nan)
+    y2 = complex(1, np.nan)
+    y3 = complex(np.nan, 1)
+    y4 = complex(2, 1)
+    y5 = complex(np.inf, 1)
+
+    Ynp = np.repeat(np.array([y1, y2, y3, y4, y5], dtype=dtype), 123)
+    Y = dpt.asarray(Ynp, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(dpt.isnan(Y)), np.isnan(Ynp))
+
+    r = dpt.empty_like(Y, dtype="bool")
+    dpt.isnan(Y, out=r)
+    assert np.array_equal(dpt.asnumpy(r)[()], np.isnan(Ynp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_isnan_floats(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    y1 = np.nan
+    y2 = 1
+    y3 = np.inf
+
+    for mult in [123, 137, 255, 271, 272]:
+        Ynp = np.repeat(np.array([y1, y2, y3], dtype=dtype), mult)
+        Y = dpt.asarray(Ynp, sycl_queue=q)
+        assert np.array_equal(dpt.asnumpy(dpt.isnan(Y)), np.isnan(Ynp))
+
+        r = dpt.empty_like(Y, dtype="bool")
+        dpt.isnan(Y, out=r)
+        assert np.array_equal(dpt.asnumpy(r)[()], np.isnan(Ynp))
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_isnan_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q)
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms)
+        expected_Y = np.full(U.shape, fill_value=False, dtype=dpt.bool)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.isnan(U, order=ord)
+            assert np.allclose(dpt.asnumpy(Y), expected_Y)
diff --git a/dpnp/tests/tensor/elementwise/test_less.py b/dpnp/tests/tensor/elementwise/test_less.py
new file mode 100644
index 000000000000..65fb9c2d9a84
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_less.py
@@ -0,0 +1,316 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_less_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.zeros(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.less(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.less(
+        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.zeros(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.less(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.less(
+        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
+def test_less_complex_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 127
+    ar1_np_real = np.random.randint(0, 10, sz)
+    ar1_np_imag = np.random.randint(0, 10, sz)
+    ar1_np = ar1_np_real + 1j * ar1_np_imag
+    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
+
+    ar2_np_real = np.random.randint(0, 10, sz)
+    ar2_np_imag = np.random.randint(0, 10, sz)
+    ar2_np = ar2_np_real + 1j * ar2_np_imag
+    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
+
+    r = dpt.less(ar1, ar2)
+    expected = np.less(ar1_np, ar2_np)
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == expected.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.less(ar1[::-2], ar2[::2])
+    expected1 = np.less(ar1_np[::-2], ar2_np[::2])
+    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
+    assert r1.shape == expected1.shape
+    assert (dpt.asnumpy(r1) == expected1).all()
+
+    ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype)
+    ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype)
+
+    ar3_np = dpt.asnumpy(ar3)
+    ar4_np = dpt.asnumpy(ar4)
+
+    r2 = dpt.less(ar3, ar4)
+    with np.errstate(invalid="ignore"):
+        expected2 = np.less(ar3_np, ar4_np)
+    assert (dpt.asnumpy(r2) == expected2).all()
+
+    r3 = dpt.less(ar4, ar4)
+    with np.errstate(invalid="ignore"):
+        expected3 = np.less(ar4_np, ar4_np)
+    assert (dpt.asnumpy(r3) == expected3).all()
+
+
+def test_less_complex_float():
+    get_queue_or_skip()
+
+    ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8")
+    ar2 = dpt.full((4,), 2, dtype="f4")
+
+    ar1_np = dpt.asnumpy(ar1)
+    ar2_np = dpt.asnumpy(ar2)
+
+    r = dpt.less(ar1, ar2)
+    expected = np.less(ar1_np, ar2_np)
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.less(ar2, ar1)
+    expected1 = np.less(ar2_np, ar1_np)
+    assert (dpt.asnumpy(r1) == expected1).all()
+    with np.errstate(invalid="ignore"):
+        for tp in [dpt.nan, dpt.inf, -dpt.inf]:
+
+            ar3 = dpt.full((4,), tp)
+            ar3_np = dpt.asnumpy(ar3)
+
+            r2 = dpt.less(ar1, ar3)
+            expected2 = np.less(ar1_np, ar3_np)
+            assert (dpt.asnumpy(r2) == expected2).all()
+
+            r3 = dpt.less(ar3, ar1)
+            expected3 = np.less(ar3_np, ar1_np)
+            assert (dpt.asnumpy(r3) == expected3).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_less_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.less(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_less_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.less(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.less(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.less(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.less(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.less(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.less(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.less(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.less(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.less(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.less(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_less_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(1, 6, dtype="i4")
+
+    r = dpt.less(m, v)
+
+    expected = np.less(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.less(v, m)
+    expected2 = np.less(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_less_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        complex(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.less(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.less(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_less_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.less(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_less_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.less(a, c)
+
+
+def test_less_mixed_integer_kinds():
+    get_queue_or_skip()
+
+    x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8"))
+    x2 = dpt.arange(10, dtype="u8")
+
+    # u8 - i8
+    assert not dpt.any(dpt.less(x2, x1))
+    # i8 - u8
+    res = dpt.less(x1, x2)
+    assert not res[0]
+    assert dpt.all(res[1:])
+
+    # Python scalar
+    assert not dpt.any(dpt.less(x2, -1))
+    assert dpt.all(dpt.less(-1, x2))
+
+
+def test_less_very_large_py_int():
+    get_queue_or_skip()
+
+    py_int = dpt.iinfo(dpt.int64).max + 10
+
+    x = dpt.asarray(3, dtype="u8")
+    assert not py_int < x
+    assert dpt.less(x, py_int)
+
+    x = dpt.asarray(py_int, dtype="u8")
+    assert not x < -1
+    assert dpt.less(-1, x)
diff --git a/dpnp/tests/tensor/elementwise/test_less_equal.py b/dpnp/tests/tensor/elementwise/test_less_equal.py
new file mode 100644
index 000000000000..b3f9d3b42a69
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_less_equal.py
@@ -0,0 +1,315 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_less_equal_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.zeros(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.less_equal(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.less_equal(
+        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.zeros(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.less_equal(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.less_equal(
+        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
+def test_less_equal_complex_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 127
+    ar1_np_real = np.random.randint(0, 10, sz)
+    ar1_np_imag = np.random.randint(0, 10, sz)
+    ar1_np = ar1_np_real + 1j * ar1_np_imag
+    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
+
+    ar2_np_real = np.random.randint(0, 10, sz)
+    ar2_np_imag = np.random.randint(0, 10, sz)
+    ar2_np = ar2_np_real + 1j * ar2_np_imag
+    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
+
+    r = dpt.less_equal(ar1, ar2)
+    expected = np.less_equal(ar1_np, ar2_np)
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == expected.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.less_equal(ar1[::-2], ar2[::2])
+    expected1 = np.less_equal(ar1_np[::-2], ar2_np[::2])
+    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
+    assert r1.shape == expected1.shape
+    assert (dpt.asnumpy(r1) == expected1).all()
+
+    ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype)
+    ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype)
+
+    ar3_np = dpt.asnumpy(ar3)
+    ar4_np = dpt.asnumpy(ar4)
+
+    r2 = dpt.less_equal(ar3, ar4)
+    with np.errstate(invalid="ignore"):
+        expected2 = np.less_equal(ar3_np, ar4_np)
+    assert (dpt.asnumpy(r2) == expected2).all()
+
+    r3 = dpt.less_equal(ar4, ar4)
+    with np.errstate(invalid="ignore"):
+        expected3 = np.less_equal(ar4_np, ar4_np)
+    assert (dpt.asnumpy(r3) == expected3).all()
+
+
+def test_less_equal_complex_float():
+    get_queue_or_skip()
+
+    ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8")
+    ar2 = dpt.full((4,), 2, dtype="f4")
+
+    ar1_np = dpt.asnumpy(ar1)
+    ar2_np = dpt.asnumpy(ar2)
+
+    r = dpt.less_equal(ar1, ar2)
+    expected = np.less_equal(ar1_np, ar2_np)
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.less_equal(ar2, ar1)
+    expected1 = np.less_equal(ar2_np, ar1_np)
+    assert (dpt.asnumpy(r1) == expected1).all()
+    with np.errstate(invalid="ignore"):
+        for tp in [dpt.nan, dpt.inf, -dpt.inf]:
+
+            ar3 = dpt.full((4,), tp)
+            ar3_np = dpt.asnumpy(ar3)
+            r2 = dpt.less_equal(ar1, ar3)
+            expected2 = np.less_equal(ar1_np, ar3_np)
+            assert (dpt.asnumpy(r2) == expected2).all()
+
+            r3 = dpt.less_equal(ar3, ar1)
+            expected3 = np.less_equal(ar3_np, ar1_np)
+            assert (dpt.asnumpy(r3) == expected3).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_less_equal_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.less_equal(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_less_equal_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.less_equal(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.less_equal(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.less_equal(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.less_equal(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.less_equal(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.less_equal(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.less_equal(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.less_equal(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.less_equal(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.less_equal(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_less_equal_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(1, 6, dtype="i4")
+
+    r = dpt.less_equal(m, v)
+
+    expected = np.less_equal(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.less_equal(v, m)
+    expected2 = np.less_equal(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_less_equal_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        complex(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.less_equal(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.less_equal(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_less_equal_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.less_equal(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_less_equal_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.less_equal(a, c)
+
+
+def test_less_equal_mixed_integer_kinds():
+    get_queue_or_skip()
+
+    x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8"))
+    x2 = dpt.arange(10, dtype="u8")
+
+    # u8 - i8
+    res = dpt.less_equal(x2, x1)
+    assert res[0]
+    assert not dpt.any(res[1:])
+    # i8 - u8
+    assert dpt.all(dpt.less_equal(x1, x2))
+
+    # Python scalar
+    assert not dpt.any(dpt.less_equal(x2, -1))
+    assert dpt.all(dpt.less_equal(-1, x2))
+
+
+def test_less_equal_very_large_py_int():
+    get_queue_or_skip()
+
+    py_int = dpt.iinfo(dpt.int64).max + 10
+
+    x = dpt.asarray(3, dtype="u8")
+    assert not py_int <= x
+    assert dpt.less_equal(x, py_int)
+
+    x = dpt.asarray(py_int, dtype="u8")
+    assert not x <= -1
+    assert dpt.less_equal(-1, x)
diff --git a/dpnp/tests/tensor/elementwise/test_log.py b/dpnp/tests/tensor/elementwise/test_log.py
new file mode 100644
index 000000000000..b41fa85df05e
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_log.py
@@ -0,0 +1,149 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose, assert_equal
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_log_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.log(np.array(1, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.log(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_log_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.log(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_log_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2 * 1027
+
+    X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.log(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_log_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 4 * dpt.e
+    X[..., 1::2] = 10 * dpt.e
+
+    Y = dpt.log(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.log(np.float32(4 * dpt.e))
+    expected_Y[..., 1::2] = np.log(np.float32(10 * dpt.e))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_log_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 4 * dpt.e
+    X[..., 1::2] = 10 * dpt.e
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.log(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.log(U, order=ord)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+def test_log_special_cases():
+    q = get_queue_or_skip()
+
+    X = dpt.asarray(
+        [dpt.nan, -dpt.inf, -1.0, -0.0, 0.0, dpt.inf], dtype="f4", sycl_queue=q
+    )
+    Y = dpt.log(X)
+
+    expected = np.array(
+        [np.nan, np.nan, np.nan, -np.inf, -np.inf, np.inf], dtype="f4"
+    )
+
+    assert_equal(dpt.asnumpy(Y), expected)
diff --git a/dpnp/tests/tensor/elementwise/test_log10.py b/dpnp/tests/tensor/elementwise/test_log10.py
new file mode 100644
index 000000000000..02c652293b9d
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_log10.py
@@ -0,0 +1,152 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_equal
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_log_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.log10(np.array(1, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.log10(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_log_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.log10(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    np.testing.assert_allclose(
+        dpt.asnumpy(Y), np.log10(Xnp), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_log_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2 * 1027
+
+    X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.log10(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    np.testing.assert_allclose(
+        dpt.asnumpy(Y), np.log10(Xnp), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_log_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 4 * dpt.e
+    X[..., 1::2] = 10 * dpt.e
+
+    Y = dpt.log10(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.log10(np.float32(4 * dpt.e))
+    expected_Y[..., 1::2] = np.log10(np.float32(10 * dpt.e))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_log_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 4 * dpt.e
+    X[..., 1::2] = 10 * dpt.e
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.log10(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.log10(U, order=ord)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            np.testing.assert_allclose(
+                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
+            )
+
+
+def test_log_special_cases():
+    q = get_queue_or_skip()
+
+    X = dpt.asarray(
+        [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q
+    )
+    Xnp = dpt.asnumpy(X)
+
+    with np.errstate(invalid="ignore", divide="ignore"):
+        assert_equal(dpt.asnumpy(dpt.log10(X)), np.log10(Xnp))
diff --git a/dpnp/tests/tensor/elementwise/test_log1p.py b/dpnp/tests/tensor/elementwise/test_log1p.py
new file mode 100644
index 000000000000..eb6205650e10
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_log1p.py
@@ -0,0 +1,188 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_log1p_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.log1p(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.log1p(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_log1p_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(0, 2, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.log1p(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.log1p(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_log1p_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2 * 1027
+
+    X = dpt.linspace(0, 2, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.log1p(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.log1p(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_log1p_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = dpt.e / 1000
+    X[..., 1::2] = dpt.e / 100
+
+    Y = dpt.log1p(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.log1p(np.float32(dpt.e / 1000))
+    expected_Y[..., 1::2] = np.log1p(np.float32(dpt.e / 100))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_log1p_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = dpt.e / 1000
+    X[..., 1::2] = dpt.e / 100
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.log1p(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.log1p(U, order=ord)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+def test_log1p_special_cases():
+    q = get_queue_or_skip()
+
+    X = dpt.asarray(
+        [dpt.nan, -2.0, -1.0, -0.0, 0.0, dpt.inf],
+        dtype="f4",
+        sycl_queue=q,
+    )
+    res = np.asarray([np.nan, np.nan, -np.inf, -0.0, 0.0, np.inf], dtype="f4")
+
+    tol = dpt.finfo(X.dtype).resolution
+    with np.errstate(divide="ignore", invalid="ignore"):
+        assert_allclose(dpt.asnumpy(dpt.log1p(X)), res, atol=tol, rtol=tol)
+
+    # special cases for complex
+    vals = [
+        complex(-1.0, 0.0),
+        complex(2.0, dpt.inf),
+        complex(2.0, dpt.nan),
+        complex(-dpt.inf, 1.0),
+        complex(dpt.inf, 1.0),
+        complex(-dpt.inf, dpt.inf),
+        complex(dpt.inf, dpt.inf),
+        complex(dpt.inf, dpt.nan),
+        complex(dpt.nan, 1.0),
+        complex(dpt.nan, dpt.inf),
+        complex(dpt.nan, dpt.nan),
+    ]
+    X = dpt.asarray(vals, dtype=dpt.complex64)
+    c_nan = complex(np.nan, np.nan)
+    res = np.asarray(
+        [
+            complex(-np.inf, 0.0),
+            complex(np.inf, np.pi / 2),
+            c_nan,
+            complex(np.inf, np.pi),
+            complex(np.inf, 0.0),
+            complex(np.inf, 3 * np.pi / 4),
+            complex(np.inf, np.pi / 4),
+            complex(np.inf, np.nan),
+            c_nan,
+            complex(np.inf, np.nan),
+            c_nan,
+        ],
+        dtype=np.complex64,
+    )
+
+    tol = dpt.finfo(X.dtype).resolution
+    with np.errstate(invalid="ignore"):
+        dpt_res = dpt.asnumpy(dpt.log1p(X))
+        assert_allclose(np.real(dpt_res), np.real(res), atol=tol, rtol=tol)
+        assert_allclose(np.imag(dpt_res), np.imag(res), atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_log2.py b/dpnp/tests/tensor/elementwise/test_log2.py
new file mode 100644
index 000000000000..7cd2f4615133
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_log2.py
@@ -0,0 +1,148 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_equal
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_log_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.log2(np.array(1, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.log2(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_log_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.log2(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    np.testing.assert_allclose(dpt.asnumpy(Y), np.log2(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_log_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2 * 1027
+
+    X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.log2(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    np.testing.assert_allclose(dpt.asnumpy(Y), np.log2(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_log_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 4 * dpt.e
+    X[..., 1::2] = 10 * dpt.e
+
+    Y = dpt.log2(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.log2(np.float32(4 * dpt.e))
+    expected_Y[..., 1::2] = np.log2(np.float32(10 * dpt.e))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_log_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 4 * dpt.e
+    X[..., 1::2] = 10 * dpt.e
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.log2(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.log2(U, order=ord)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            np.testing.assert_allclose(
+                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
+            )
+
+
+def test_log_special_cases():
+    q = get_queue_or_skip()
+
+    X = dpt.asarray(
+        [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q
+    )
+    Xnp = dpt.asnumpy(X)
+
+    with np.errstate(invalid="ignore", divide="ignore"):
+        assert_equal(dpt.asnumpy(dpt.log2(X)), np.log2(Xnp))
diff --git a/dpnp/tests/tensor/elementwise/test_logaddexp.py b/dpnp/tests/tensor/elementwise/test_logaddexp.py
new file mode 100644
index 000000000000..a1502f4c3d11
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_logaddexp.py
@@ -0,0 +1,213 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+import re
+
+import dpctl
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _compare_dtypes,
+    _no_complex_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes)
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes)
+def test_logaddexp_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.logaddexp(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.logaddexp(dpt.asnumpy(ar1), dpt.asnumpy(ar2))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    tol = 8 * max(
+        np.finfo(r.dtype).resolution, np.finfo(expected.dtype).resolution
+    )
+    assert_allclose(
+        dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol
+    )
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.logaddexp(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.logaddexp(dpt.asnumpy(ar3)[::-1], dpt.asnumpy(ar4)[::2])
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert_allclose(
+        dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_logaddexp_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.logaddexp(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_logaddexp_order():
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
+        ar1 = dpt.ones(test_shape, dtype=dt1, order="C")
+        ar2 = dpt.ones(test_shape, dtype=dt2, order="C")
+        r1 = dpt.logaddexp(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.logaddexp(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.logaddexp(ar1, ar2, order="A")
+        assert r3.flags.c_contiguous
+        r4 = dpt.logaddexp(ar1, ar2, order="K")
+        assert r4.flags.c_contiguous
+
+        ar1 = dpt.ones(test_shape, dtype=dt1, order="F")
+        ar2 = dpt.ones(test_shape, dtype=dt2, order="F")
+        r1 = dpt.logaddexp(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.logaddexp(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.logaddexp(ar1, ar2, order="A")
+        assert r3.flags.f_contiguous
+        r4 = dpt.logaddexp(ar1, ar2, order="K")
+        assert r4.flags.f_contiguous
+
+        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2]
+        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2]
+        r4 = dpt.logaddexp(ar1, ar2, order="K")
+        assert r4.strides == (n, -1)
+        r5 = dpt.logaddexp(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT
+        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT
+        r4 = dpt.logaddexp(ar1, ar2, order="K")
+        assert r4.strides == (-1, n)
+        r5 = dpt.logaddexp(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+
+def test_logaddexp_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(1, 6, dtype="i4")
+
+    r = dpt.logaddexp(m, v)
+
+    expected = np.logaddexp(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.logaddexp(v, m)
+    expected2 = np.logaddexp(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+def test_logaddexp_broadcasting_error():
+    get_queue_or_skip()
+    m = dpt.ones((10, 10), dtype="i4")
+    v = dpt.ones((3,), dtype="i4")
+    with pytest.raises(ValueError):
+        dpt.logaddexp(m, v)
+
+
+@pytest.mark.parametrize("arr_dt", _no_complex_dtypes)
+def test_logaddexp_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_zeros = (
+        bool(0),
+        int(0),
+        float(0),
+        np.float32(0),
+        ctypes.c_int(0),
+    )
+    for sc in py_zeros:
+        R = dpt.logaddexp(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.logaddexp(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_logaddexp_dtype_error(
+    dtype,
+):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    ar1 = dpt.ones(5, dtype=dtype)
+    ar2 = dpt.ones_like(ar1, dtype="f4")
+
+    y = dpt.zeros_like(ar1, dtype="int8")
+    with pytest.raises(ValueError) as excinfo:
+        dpt.logaddexp(ar1, ar2, out=y)
+    assert re.match("Output array of type.*is needed", str(excinfo.value))
diff --git a/dpnp/tests/tensor/elementwise/test_logical_and.py b/dpnp/tests/tensor/elementwise/test_logical_and.py
new file mode 100644
index 000000000000..064c295812b1
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_logical_and.py
@@ -0,0 +1,323 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_logical_and_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op1_dtype)
+    ar2 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op2_dtype)
+
+    r = dpt.logical_and(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    expected = np.logical_and(dpt.asnumpy(ar1), dpt.asnumpy(ar2))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r2 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.logical_and(ar1, ar2, out=r2)
+    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
+
+    ar3 = dpt.zeros(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.logical_and(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.logical_and(
+        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r2 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.logical_and(ar3[::-1], ar4[::2], out=r2)
+    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
+
+
+@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
+def test_logical_and_complex_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 127
+    ar1_np_real = np.random.randint(0, 2, sz)
+    ar1_np_imag = np.random.randint(0, 2, sz)
+    ar1_np = ar1_np_real + 1j * ar1_np_imag
+    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
+
+    ar2_np_real = np.random.randint(0, 2, sz)
+    ar2_np_imag = np.random.randint(0, 2, sz)
+    ar2_np = ar2_np_real + 1j * ar2_np_imag
+    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
+
+    r = dpt.logical_and(ar1, ar2)
+    expected = np.logical_and(ar1_np, ar2_np)
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == expected.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.logical_and(ar1[::-2], ar2[::2])
+    expected1 = np.logical_and(ar1_np[::-2], ar2_np[::2])
+    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
+    assert r1.shape == expected1.shape
+    assert (dpt.asnumpy(r1) == expected1).all()
+
+    ar3 = dpt.asarray(
+        [
+            2.0 + 0j,
+            dpt.nan,
+            dpt.nan * 1j,
+            dpt.inf,
+            dpt.inf * 1j,
+            -dpt.inf,
+            -dpt.inf * 1j,
+        ],
+        dtype=op_dtype,
+    )
+    ar4 = dpt.full(ar3.shape, fill_value=1.0 + 2j, dtype=op_dtype)
+
+    ar3_np = dpt.asnumpy(ar3)
+    ar4_np = dpt.asnumpy(ar4)
+
+    r2 = dpt.logical_and(ar3, ar4)
+    with np.errstate(invalid="ignore"):
+        expected2 = np.logical_and(ar3_np, ar4_np)
+    assert (dpt.asnumpy(r2) == expected2).all()
+
+    r3 = dpt.logical_and(ar4, ar4)
+    with np.errstate(invalid="ignore"):
+        expected3 = np.logical_and(ar4_np, ar4_np)
+    assert (dpt.asnumpy(r3) == expected3).all()
+
+
+def test_logical_and_complex_float():
+    get_queue_or_skip()
+
+    ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8")
+    ar2 = dpt.full(ar1.shape, 2, dtype="f4")
+
+    ar1_np = dpt.asnumpy(ar1)
+    ar2_np = dpt.asnumpy(ar2)
+
+    r = dpt.logical_and(ar1, ar2)
+    expected = np.logical_and(ar1_np, ar2_np)
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.logical_and(ar2, ar1)
+    expected1 = np.logical_and(ar2_np, ar1_np)
+    assert (dpt.asnumpy(r1) == expected1).all()
+    with np.errstate(invalid="ignore"):
+        for tp in [
+            dpt.nan,
+            dpt.nan * 1j,
+            dpt.inf,
+            dpt.inf * 1j,
+            -dpt.inf,
+            -dpt.inf * 1j,
+        ]:
+            ar3 = dpt.full(ar1.shape, tp)
+            ar3_np = dpt.asnumpy(ar3)
+            r2 = dpt.logical_and(ar1, ar3)
+            expected2 = np.logical_and(ar1_np, ar3_np)
+            assert (dpt.asnumpy(r2) == expected2).all()
+
+            r3 = dpt.logical_and(ar3, ar1)
+            expected3 = np.logical_and(ar3_np, ar1_np)
+            assert (dpt.asnumpy(r3) == expected3).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_logical_and_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.asarray(
+        np.random.randint(0, 2, sz), dtype="i4", usm_type=op1_usm_type
+    )
+    ar2 = dpt.asarray(
+        np.random.randint(0, 2, sz), dtype=ar1.dtype, usm_type=op2_usm_type
+    )
+
+    r = dpt.logical_and(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_logical_and_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.logical_and(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.logical_and(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.logical_and(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.logical_and(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.logical_and(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.logical_and(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.logical_and(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.logical_and(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.logical_and(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.logical_and(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_logical_and_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.asarray(np.random.randint(0, 2, (100, 5)), dtype="i4")
+    v = dpt.arange(1, 6, dtype="i4")
+
+    r = dpt.logical_and(m, v)
+
+    expected = np.logical_and(dpt.asnumpy(m), dpt.asnumpy(v))
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r2 = dpt.logical_and(v, m)
+    expected2 = np.logical_and(dpt.asnumpy(v), dpt.asnumpy(m))
+    assert (dpt.asnumpy(r2) == expected2).all()
+
+    r3 = dpt.empty_like(r)
+    dpt.logical_and(m, v, out=r3)
+    assert (dpt.asnumpy(r3) == expected).all()
+
+    r4 = dpt.empty_like(r)
+    dpt.logical_and(v, m, out=r4)
+    assert (dpt.asnumpy(r4) == expected).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+@pytest.mark.parametrize("scalar_val", [0, 1])
+def test_logical_and_python_scalar(arr_dt, scalar_val):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.asarray(
+        np.random.randint(0, 2, (10, 10)), dtype=arr_dt, sycl_queue=q
+    )
+    py_ones = (
+        bool(scalar_val),
+        int(scalar_val),
+        float(scalar_val),
+        complex(scalar_val),
+        np.float32(scalar_val),
+        ctypes.c_int(scalar_val),
+    )
+    for sc in py_ones:
+        R = dpt.logical_and(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        E = np.logical_and(dpt.asnumpy(X), sc)
+        assert (dpt.asnumpy(R) == E).all()
+
+        R = dpt.logical_and(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+        E = np.logical_and(sc, dpt.asnumpy(X))
+        assert (dpt.asnumpy(R) == E).all()
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_logical_and_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.logical_and(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_logical_and_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.logical_and(a, c)
diff --git a/dpnp/tests/tensor/elementwise/test_logical_not.py b/dpnp/tests/tensor/elementwise/test_logical_not.py
new file mode 100644
index 000000000000..fa1d5e786bd3
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_logical_not.py
@@ -0,0 +1,198 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op_dtype", _all_dtypes)
+def test_logical_not_dtype_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 7
+    ar1_np = np.random.randint(0, 2, sz)
+    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
+
+    r = dpt.logical_not(ar1)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    expected = np.logical_not(ar1_np)
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r2 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.logical_not(ar1, out=r2)
+    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
+
+    ar2 = dpt.zeros(sz, dtype=op_dtype)
+    r = dpt.logical_not(ar2[::-1])
+    assert isinstance(r, dpt.usm_ndarray)
+
+    expected = np.logical_not(np.zeros(ar2.shape, dtype=op_dtype))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar2.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    ar3 = dpt.ones(sz, dtype=op_dtype)
+    r2 = dpt.logical_not(ar3[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+
+    expected = np.logical_not(np.ones(ar3.shape, dtype=op_dtype)[::2])
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert (dpt.asnumpy(r2) == expected).all()
+
+    r3 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.logical_not(ar2[::-1], out=r3)
+    assert (dpt.asnumpy(r) == dpt.asnumpy(r3)).all()
+
+
+@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
+def test_logical_not_complex_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 127
+    ar1_np_real = np.random.randint(0, 2, sz)
+    ar1_np_imag = np.random.randint(0, 2, sz)
+    ar1_np = ar1_np_real + 1j * ar1_np_imag
+    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
+
+    r = dpt.logical_not(ar1)
+    expected = np.logical_not(ar1_np)
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == expected.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.logical_not(ar1[::-2])
+    expected1 = np.logical_not(ar1_np[::-2])
+    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
+    assert r1.shape == expected1.shape
+    assert (dpt.asnumpy(r1) == expected1).all()
+
+    ar2 = dpt.asarray(
+        [
+            2.0 + 0j,
+            dpt.nan,
+            dpt.nan * 1j,
+            dpt.inf,
+            dpt.inf * 1j,
+            -dpt.inf,
+            -dpt.inf * 1j,
+        ],
+        dtype=op_dtype,
+    )
+    ar2_np = dpt.asnumpy(ar2)
+    r2 = dpt.logical_not(ar2)
+    with np.errstate(invalid="ignore"):
+        expected2 = np.logical_not(ar2_np)
+    assert (dpt.asnumpy(r2) == expected2).all()
+
+
+def test_logical_not_complex_float():
+    get_queue_or_skip()
+
+    ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8")
+
+    r = dpt.logical_not(ar1)
+    expected = np.logical_not(dpt.asnumpy(ar1))
+    assert (dpt.asnumpy(r) == expected).all()
+
+    with np.errstate(invalid="ignore"):
+        for tp in [
+            dpt.nan,
+            dpt.nan * 1j,
+            dpt.inf,
+            dpt.inf * 1j,
+            -dpt.inf,
+            -dpt.inf * 1j,
+        ]:
+            ar2 = dpt.full(ar1.shape, tp)
+            r2 = dpt.logical_not(ar2)
+            expected2 = np.logical_not(dpt.asnumpy(ar2))
+            assert (dpt.asnumpy(r2) == expected2).all()
+
+
+@pytest.mark.parametrize("op_usm_type", _usm_types)
+def test_logical_not_usm_type_matrix(op_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.asarray(
+        np.random.randint(0, 2, sz), dtype="i4", usm_type=op_usm_type
+    )
+
+    r = dpt.logical_not(ar1)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.usm_type == op_usm_type
+
+
+def test_logical_not_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.logical_not(ar1, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.logical_not(ar1, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.logical_not(ar1, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.logical_not(ar1, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.zeros((20, 20), dtype="i4", order="F")
+    r1 = dpt.logical_not(ar1, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.logical_not(ar1, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.logical_not(ar1, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.logical_not(ar1, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.logical_not(ar1, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.zeros((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.logical_not(ar1, order="K")
+    assert r4.strides == (-1, 20)
diff --git a/dpnp/tests/tensor/elementwise/test_logical_or.py b/dpnp/tests/tensor/elementwise/test_logical_or.py
new file mode 100644
index 000000000000..6987183e37a7
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_logical_or.py
@@ -0,0 +1,324 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_logical_or_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op1_dtype)
+    ar2 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op2_dtype)
+
+    r = dpt.logical_or(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    expected = np.logical_or(dpt.asnumpy(ar1), dpt.asnumpy(ar2))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r2 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.logical_or(ar1, ar2, out=r2)
+    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
+
+    ar3 = dpt.zeros(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.logical_or(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.logical_or(
+        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r2 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.logical_or(ar3[::-1], ar4[::2], out=r2)
+    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
+
+
+@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
+def test_logical_or_complex_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 127
+    ar1_np_real = np.random.randint(0, 2, sz)
+    ar1_np_imag = np.random.randint(0, 2, sz)
+    ar1_np = ar1_np_real + 1j * ar1_np_imag
+    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
+
+    ar2_np_real = np.random.randint(0, 2, sz)
+    ar2_np_imag = np.random.randint(0, 2, sz)
+    ar2_np = ar2_np_real + 1j * ar2_np_imag
+    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
+
+    r = dpt.logical_or(ar1, ar2)
+    expected = np.logical_or(ar1_np, ar2_np)
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == expected.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.logical_or(ar1[::-2], ar2[::2])
+    expected1 = np.logical_or(ar1_np[::-2], ar2_np[::2])
+    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
+    assert r1.shape == expected1.shape
+    assert (dpt.asnumpy(r1) == expected1).all()
+
+    ar3 = dpt.asarray(
+        [
+            2.0 + 0j,
+            dpt.nan,
+            dpt.nan * 1j,
+            dpt.inf,
+            dpt.inf * 1j,
+            -dpt.inf,
+            -dpt.inf * 1j,
+        ],
+        dtype=op_dtype,
+    )
+    ar4 = dpt.full(ar3.shape, fill_value=1.0 + 2j, dtype=op_dtype)
+
+    ar3_np = dpt.asnumpy(ar3)
+    ar4_np = dpt.asnumpy(ar4)
+
+    r2 = dpt.logical_or(ar3, ar4)
+    with np.errstate(invalid="ignore"):
+        expected2 = np.logical_or(ar3_np, ar4_np)
+    assert (dpt.asnumpy(r2) == expected2).all()
+
+    r3 = dpt.logical_or(ar4, ar4)
+    with np.errstate(invalid="ignore"):
+        expected3 = np.logical_or(ar4_np, ar4_np)
+    assert (dpt.asnumpy(r3) == expected3).all()
+
+
+def test_logical_or_complex_float():
+    get_queue_or_skip()
+
+    ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8")
+    ar2 = dpt.full(ar1.shape, 2, dtype="f4")
+
+    ar1_np = dpt.asnumpy(ar1)
+    ar2_np = dpt.asnumpy(ar2)
+
+    r = dpt.logical_or(ar1, ar2)
+    expected = np.logical_or(ar1_np, ar2_np)
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.logical_or(ar2, ar1)
+    expected1 = np.logical_or(ar2_np, ar1_np)
+    assert (dpt.asnumpy(r1) == expected1).all()
+    with np.errstate(invalid="ignore"):
+        for tp in [
+            dpt.nan,
+            dpt.nan * 1j,
+            dpt.inf,
+            dpt.inf * 1j,
+            -dpt.inf,
+            -dpt.inf * 1j,
+        ]:
+            ar3 = dpt.full(ar1.shape, tp)
+            ar3_np = dpt.asnumpy(ar3)
+
+            r2 = dpt.logical_or(ar1, ar3)
+            expected2 = np.logical_or(ar1_np, ar3_np)
+            assert (dpt.asnumpy(r2) == expected2).all()
+
+            r3 = dpt.logical_or(ar3, ar1)
+            expected3 = np.logical_or(ar3_np, ar1_np)
+            assert (dpt.asnumpy(r3) == expected3).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_logical_or_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.asarray(
+        np.random.randint(0, 2, sz), dtype="i4", usm_type=op1_usm_type
+    )
+    ar2 = dpt.asarray(
+        np.random.randint(0, 2, sz), dtype=ar1.dtype, usm_type=op2_usm_type
+    )
+
+    r = dpt.logical_or(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_logical_or_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.logical_or(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.logical_or(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.logical_or(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.logical_or(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.logical_or(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.logical_or(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.logical_or(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.logical_or(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.logical_or(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.logical_or(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_logical_or_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.asarray(np.random.randint(0, 2, (100, 5)), dtype="i4")
+    v = dpt.arange(1, 6, dtype="i4")
+
+    r = dpt.logical_or(m, v)
+
+    expected = np.logical_or(dpt.asnumpy(m), dpt.asnumpy(v))
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r2 = dpt.logical_or(v, m)
+    expected2 = np.logical_or(dpt.asnumpy(v), dpt.asnumpy(m))
+    assert (dpt.asnumpy(r2) == expected2).all()
+
+    r3 = dpt.empty_like(r)
+    dpt.logical_or(m, v, out=r3)
+    assert (dpt.asnumpy(r3) == expected).all()
+
+    r4 = dpt.empty_like(r)
+    dpt.logical_or(v, m, out=r4)
+    assert (dpt.asnumpy(r4) == expected).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+@pytest.mark.parametrize("scalar_val", [0, 1])
+def test_logical_or_python_scalar(arr_dt, scalar_val):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.asarray(
+        np.random.randint(0, 2, (10, 10)), dtype=arr_dt, sycl_queue=q
+    )
+    py_ones = (
+        bool(scalar_val),
+        int(scalar_val),
+        float(scalar_val),
+        complex(scalar_val),
+        np.float32(scalar_val),
+        ctypes.c_int(scalar_val),
+    )
+    for sc in py_ones:
+        R = dpt.logical_or(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        E = np.logical_or(dpt.asnumpy(X), sc)
+        assert (dpt.asnumpy(R) == E).all()
+
+        R = dpt.logical_or(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+        E = np.logical_or(sc, dpt.asnumpy(X))
+        assert (dpt.asnumpy(R) == E).all()
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_logical_or_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.logical_or(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_logical_or_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.logical_or(a, c)
diff --git a/dpnp/tests/tensor/elementwise/test_logical_xor.py b/dpnp/tests/tensor/elementwise/test_logical_xor.py
new file mode 100644
index 000000000000..043c704bcf4b
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_logical_xor.py
@@ -0,0 +1,325 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_logical_xor_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1_np = np.random.randint(0, 2, sz)
+    ar1 = dpt.asarray(ar1_np, dtype=op1_dtype)
+    ar2_np = np.random.randint(0, 2, sz)
+    ar2 = dpt.asarray(ar2_np, dtype=op2_dtype)
+
+    r = dpt.logical_xor(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    expected = np.logical_xor(ar1_np, ar2_np)
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r2 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.logical_xor(ar1, ar2, out=r2)
+    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
+
+    ar3 = dpt.zeros(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.logical_xor(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.logical_xor(
+        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r2 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.logical_xor(ar3[::-1], ar4[::2], out=r2)
+    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
+
+
+@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
+def test_logical_xor_complex_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 127
+    ar1_np_real = np.random.randint(0, 2, sz)
+    ar1_np_imag = np.random.randint(0, 2, sz)
+    ar1_np = ar1_np_real + 1j * ar1_np_imag
+    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
+
+    ar2_np_real = np.random.randint(0, 2, sz)
+    ar2_np_imag = np.random.randint(0, 2, sz)
+    ar2_np = ar2_np_real + 1j * ar2_np_imag
+    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
+
+    r = dpt.logical_xor(ar1, ar2)
+    expected = np.logical_xor(ar1_np, ar2_np)
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == expected.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.logical_xor(ar1[::-2], ar2[::2])
+    expected1 = np.logical_xor(ar1_np[::-2], ar2_np[::2])
+    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
+    assert r1.shape == expected1.shape
+    assert (dpt.asnumpy(r1) == expected1).all()
+
+    ar3 = dpt.asarray(
+        [
+            2.0 + 0j,
+            dpt.nan,
+            dpt.nan * 1j,
+            dpt.inf,
+            dpt.inf * 1j,
+            -dpt.inf,
+            -dpt.inf * 1j,
+        ],
+        dtype=op_dtype,
+    )
+    ar4 = dpt.full(ar3.shape, fill_value=1.0 + 2j, dtype=op_dtype)
+
+    ar3_np = dpt.asnumpy(ar3)
+    ar4_np = dpt.asnumpy(ar4)
+
+    r2 = dpt.logical_xor(ar3, ar4)
+    with np.errstate(invalid="ignore"):
+        expected2 = np.logical_xor(ar3_np, ar4_np)
+    assert (dpt.asnumpy(r2) == expected2).all()
+
+    r3 = dpt.logical_xor(ar4, ar4)
+    with np.errstate(invalid="ignore"):
+        expected3 = np.logical_xor(ar4_np, ar4_np)
+    assert (dpt.asnumpy(r3) == expected3).all()
+
+
+def test_logical_xor_complex_float():
+    get_queue_or_skip()
+
+    ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8")
+    ar2 = dpt.full(ar1.shape, 2, dtype="f4")
+
+    ar1_np = dpt.asnumpy(ar1)
+    ar2_np = dpt.asnumpy(ar1)
+
+    r = dpt.logical_xor(ar1, ar2)
+    expected = np.logical_xor(ar1_np, ar2_np)
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r1 = dpt.logical_xor(ar2, ar1)
+    expected1 = np.logical_xor(ar2_np, ar1_np)
+    assert (dpt.asnumpy(r1) == expected1).all()
+    with np.errstate(invalid="ignore"):
+        for tp in [
+            dpt.nan,
+            dpt.nan * 1j,
+            dpt.inf,
+            dpt.inf * 1j,
+            -dpt.inf,
+            -dpt.inf * 1j,
+        ]:
+            ar3 = dpt.full(ar1.shape, tp)
+            ar3_np = dpt.asnumpy(ar3)
+            r2 = dpt.logical_xor(ar1, ar3)
+            expected2 = np.logical_xor(ar1_np, ar3_np)
+            assert (dpt.asnumpy(r2) == expected2).all()
+
+            r3 = dpt.logical_xor(ar3, ar1)
+            expected3 = np.logical_xor(ar3_np, ar1_np)
+            assert (dpt.asnumpy(r3) == expected3).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_logical_xor_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.asarray(
+        np.random.randint(0, 2, sz), dtype="i4", usm_type=op1_usm_type
+    )
+    ar2 = dpt.asarray(
+        np.random.randint(0, 2, sz), dtype=ar1.dtype, usm_type=op2_usm_type
+    )
+
+    r = dpt.logical_xor(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_logical_xor_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.logical_xor(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.logical_xor(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.logical_xor(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.logical_xor(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.logical_xor(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.logical_xor(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.logical_xor(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.logical_xor(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.logical_xor(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.logical_xor(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_logical_xor_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.asarray(np.random.randint(0, 2, (100, 5)), dtype="i4")
+    v = dpt.arange(1, 6, dtype="i4")
+
+    r = dpt.logical_xor(m, v)
+
+    expected = np.logical_xor(dpt.asnumpy(m), dpt.asnumpy(v))
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r2 = dpt.logical_xor(v, m)
+    expected2 = np.logical_xor(dpt.asnumpy(v), dpt.asnumpy(m))
+    assert (dpt.asnumpy(r2) == expected2).all()
+
+    r3 = dpt.empty_like(r)
+    dpt.logical_xor(m, v, out=r3)
+    assert (dpt.asnumpy(r3) == expected).all()
+
+    r4 = dpt.empty_like(r)
+    dpt.logical_xor(v, m, out=r4)
+    assert (dpt.asnumpy(r4) == expected).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+@pytest.mark.parametrize("scalar_val", [0, 1])
+def test_logical_xor_python_scalar(arr_dt, scalar_val):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.asarray(
+        np.random.randint(0, 2, (10, 10)), dtype=arr_dt, sycl_queue=q
+    )
+    py_ones = (
+        bool(scalar_val),
+        int(scalar_val),
+        float(scalar_val),
+        complex(scalar_val),
+        np.float32(scalar_val),
+        ctypes.c_int(scalar_val),
+    )
+    for sc in py_ones:
+        R = dpt.logical_xor(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        E = np.logical_xor(dpt.asnumpy(X), sc)
+        assert (dpt.asnumpy(R) == E).all()
+
+        R = dpt.logical_xor(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+        E = np.logical_xor(sc, dpt.asnumpy(X))
+        assert (dpt.asnumpy(R) == E).all()
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_logical_xor_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.logical_xor(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_logical_xor_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.logical_xor(a, c)
diff --git a/dpnp/tests/tensor/elementwise/test_maximum_minimum.py b/dpnp/tests/tensor/elementwise/test_maximum_minimum.py
new file mode 100644
index 000000000000..7e0bce95baf9
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_maximum_minimum.py
@@ -0,0 +1,333 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+import itertools
+
+import dpctl
+import numpy as np
+import pytest
+from numpy.testing import assert_array_equal
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_maximum_minimum_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1_np = np.arange(sz)
+    np.random.shuffle(ar1_np)
+    ar1 = dpt.asarray(ar1_np, dtype=op1_dtype)
+    ar2_np = np.arange(sz)
+    np.random.shuffle(ar2_np)
+    ar2 = dpt.asarray(ar2_np, dtype=op2_dtype)
+
+    r = dpt.maximum(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.maximum(ar1_np.astype(op1_dtype), ar2_np.astype(op2_dtype))
+
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r = dpt.minimum(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.minimum(ar1_np.astype(op1_dtype), ar2_np.astype(op2_dtype))
+
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3_np = np.arange(sz)
+    np.random.shuffle(ar3_np)
+    ar3 = dpt.asarray(ar3_np, dtype=op1_dtype)
+    ar4_np = np.arange(2 * sz)
+    np.random.shuffle(ar4_np)
+    ar4 = dpt.asarray(ar4_np, dtype=op2_dtype)
+
+    r = dpt.maximum(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.maximum(
+        ar3_np[::-1].astype(op1_dtype), ar4_np[::2].astype(op2_dtype)
+    )
+
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r = dpt.minimum(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.minimum(
+        ar3_np[::-1].astype(op1_dtype), ar4_np[::2].astype(op2_dtype)
+    )
+
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+
+@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
+def test_maximum_minimum_complex_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 127
+    ar1_np_real = np.random.randint(0, 10, sz)
+    ar1_np_imag = np.random.randint(0, 10, sz)
+    ar1 = dpt.asarray(ar1_np_real + 1j * ar1_np_imag, dtype=op_dtype)
+
+    ar2_np_real = np.random.randint(0, 10, sz)
+    ar2_np_imag = np.random.randint(0, 10, sz)
+    ar2 = dpt.asarray(ar2_np_real + 1j * ar2_np_imag, dtype=op_dtype)
+
+    r = dpt.maximum(ar1, ar2)
+    expected = np.maximum(dpt.asnumpy(ar1), dpt.asnumpy(ar2))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == expected.shape
+    assert_array_equal(dpt.asnumpy(r), expected)
+
+    r1 = dpt.maximum(ar1[::-2], ar2[::2])
+    expected1 = np.maximum(dpt.asnumpy(ar1[::-2]), dpt.asnumpy(ar2[::2]))
+    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
+    assert r1.shape == expected1.shape
+    assert_array_equal(dpt.asnumpy(r1), expected1)
+
+    r = dpt.minimum(ar1, ar2)
+    expected = np.minimum(dpt.asnumpy(ar1), dpt.asnumpy(ar2))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == expected.shape
+    assert_array_equal(dpt.asnumpy(r), expected)
+
+    r1 = dpt.minimum(ar1[::-2], ar2[::2])
+    expected1 = np.minimum(dpt.asnumpy(ar1[::-2]), dpt.asnumpy(ar2[::2]))
+    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
+    assert r1.shape == expected1.shape
+    assert_array_equal(dpt.asnumpy(r1), expected1)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_maximum_minimum_real_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, 5.0, -3.0]
+    x = list(itertools.product(x, repeat=2))
+    Xnp = np.array([tup[0] for tup in x], dtype=dtype)
+    Ynp = np.array([tup[1] for tup in x], dtype=dtype)
+    X = dpt.asarray(Xnp, dtype=dtype)
+    Y = dpt.asarray(Ynp, dtype=dtype)
+
+    R = dpt.maximum(X, Y)
+    Rnp = np.maximum(Xnp, Ynp)
+    assert_array_equal(dpt.asnumpy(R), Rnp)
+
+    R = dpt.minimum(X, Y)
+    Rnp = np.minimum(Xnp, Ynp)
+    assert_array_equal(dpt.asnumpy(R), Rnp)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_maximum_minimum_complex_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, -np.inf, -np.inf, +2.0, -1.0]
+    x = [complex(*val) for val in itertools.product(x, repeat=2)]
+    x = list(itertools.product(x, repeat=2))
+
+    Xnp = np.array([tup[0] for tup in x], dtype=dtype)
+    Ynp = np.array([tup[1] for tup in x], dtype=dtype)
+    X = dpt.asarray(Xnp, dtype=dtype, sycl_queue=q)
+    Y = dpt.asarray(Ynp, dtype=dtype, sycl_queue=q)
+
+    R = dpt.maximum(X, Y)
+    Rnp = np.maximum(Xnp, Ynp)
+    assert_array_equal(dpt.asnumpy(dpt.real(R)), np.real(Rnp))
+    assert_array_equal(dpt.asnumpy(dpt.imag(R)), np.imag(Rnp))
+
+    R = dpt.minimum(X, Y)
+    Rnp = np.minimum(Xnp, Ynp)
+    assert_array_equal(dpt.asnumpy(dpt.real(R)), np.real(Rnp))
+    assert_array_equal(dpt.asnumpy(dpt.imag(R)), np.imag(Rnp))
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_maximum_minimum_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1_np = np.arange(sz, dtype="i4")
+    np.random.shuffle(ar1_np)
+    ar1 = dpt.asarray(ar1_np, usm_type=op1_usm_type)
+    ar2_np = np.arange(sz, dtype="i4")
+    np.random.shuffle(ar2_np)
+    ar2 = dpt.asarray(ar2_np, usm_type=op2_usm_type)
+
+    r = dpt.maximum(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+    r = dpt.minimum(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_maximum_minimum_order():
+    get_queue_or_skip()
+
+    ar1_np = np.arange(20 * 20, dtype="i4").reshape(20, 20)
+    np.random.shuffle(ar1_np)
+    ar1 = dpt.asarray(ar1_np, order="C")
+    ar2_np = np.arange(20 * 20, dtype="i4").reshape(20, 20)
+    np.random.shuffle(ar2_np)
+    ar2 = dpt.asarray(ar2_np, order="C")
+
+    r1 = dpt.maximum(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.maximum(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.maximum(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.maximum(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.asarray(ar1_np, order="F")
+    ar2 = dpt.asarray(ar2_np, order="F")
+    r1 = dpt.maximum(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.maximum(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.maximum(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.maximum(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1_np = np.arange(40 * 40, dtype="i4").reshape(40, 40)
+    np.random.shuffle(ar1_np)
+    ar1 = dpt.asarray(ar1_np, order="C")[:20, ::-2]
+    ar2_np = np.arange(40 * 40, dtype="i4").reshape(40, 40)
+    np.random.shuffle(ar2_np)
+    ar2 = dpt.asarray(ar2_np, order="C")[:20, ::-2]
+    r4 = dpt.maximum(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.asarray(ar1_np, order="C")[:20, ::-2].mT
+    ar2 = dpt.asarray(ar2_np, order="C")[:20, ::-2].mT
+    r4 = dpt.maximum(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_maximum_minimum_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        complex(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.maximum(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.maximum(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+        R = dpt.minimum(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.minimum(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_maximum_minimum_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.maximum(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    r = dpt.minimum(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_maximum_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.maximum(a, c)
+
+    with pytest.raises(ValueError):
+        dpt.minimum(a, c)
diff --git a/dpnp/tests/tensor/elementwise/test_multiply.py b/dpnp/tests/tensor/elementwise/test_multiply.py
new file mode 100644
index 000000000000..df0defc7cfc6
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_multiply.py
@@ -0,0 +1,253 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_multiply_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.multiply(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.multiply(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.multiply(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.multiply(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_multiply_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.multiply(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_multiply_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.multiply(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.multiply(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.multiply(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.multiply(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.multiply(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.multiply(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.multiply(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.multiply(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.multiply(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.multiply(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_multiply_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(1, 6, dtype="i4")
+
+    r = dpt.multiply(m, v)
+
+    expected = np.multiply(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.multiply(v, m)
+    expected2 = np.multiply(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_multiply_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        complex(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.multiply(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.multiply(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+@pytest.mark.parametrize("sc", [bool(1), int(1), float(1), complex(1)])
+def test_multiply_python_scalar_gh1219(arr_dt, sc):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    Xnp = np.ones(4, dtype=arr_dt)
+
+    X = dpt.ones(4, dtype=arr_dt, sycl_queue=q)
+
+    R = dpt.multiply(X, sc)
+    Rnp = np.multiply(Xnp, sc)
+    assert _compare_dtypes(R.dtype, Rnp.dtype, sycl_queue=q)
+
+    # symmetric case
+    R = dpt.multiply(sc, X)
+    Rnp = np.multiply(sc, Xnp)
+    assert _compare_dtypes(R.dtype, Rnp.dtype, sycl_queue=q)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_multiply_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X *= int(1)
+    elif dt_kind == "f":
+        X *= float(1)
+    elif dt_kind == "c":
+        X *= complex(1)
+    elif dt_kind == "b":
+        X *= bool(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_multiply_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 *= ar2
+        assert (
+            dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype)
+        ).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+        ar3[::-1] *= ar4[::2]
+        assert (
+            dpt.asnumpy(ar3) == np.full(ar3.shape, 1, dtype=ar3.dtype)
+        ).all()
+
+    else:
+        with pytest.raises(ValueError):
+            ar1 *= ar2
+
+
+def test_multiply_inplace_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    m *= v
+    assert (dpt.asnumpy(m) == np.arange(0, 5, dtype="i4")[np.newaxis, :]).all()
diff --git a/dpnp/tests/tensor/elementwise/test_negative.py b/dpnp/tests/tensor/elementwise/test_negative.py
new file mode 100644
index 000000000000..9713f0ecb364
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_negative.py
@@ -0,0 +1,101 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes, _usm_types
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_negative_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q)
+    assert dpt.negative(X).dtype == arg_dt
+
+    r = dpt.empty_like(X, dtype=arg_dt)
+    dpt.negative(X, out=r)
+    assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.negative(X)))
+
+
+def test_negative_bool():
+    get_queue_or_skip()
+    x = dpt.ones(64, dtype="?")
+    with pytest.raises(ValueError):
+        dpt.negative(x)
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_negative_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("i4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    Y = dpt.negative(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.negative(dpt.asnumpy(X))
+    assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_negative_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.negative(np.ones(U.shape, dtype=U.dtype))
+        expected_Y[..., 1::2] = 0
+        expected_Y = np.transpose(expected_Y, perms)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.negative(U, order=ord)
+            assert np.allclose(dpt.asnumpy(Y), expected_Y)
diff --git a/dpnp/tests/tensor/elementwise/test_nextafter.py b/dpnp/tests/tensor/elementwise/test_nextafter.py
new file mode 100644
index 000000000000..b904bc42c6b7
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_nextafter.py
@@ -0,0 +1,169 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _compare_dtypes,
+    _no_complex_dtypes,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
+def test_nextafter_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    r = dpt.nextafter(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.nextafter(
+        np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)
+
+    r = dpt.nextafter(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.nextafter(
+        np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:])
+def test_nextafter_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.nextafter(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.nextafter(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_nextafter_special_cases_nan(dt):
+    """If either x1_i or x2_i is NaN, the result is NaN."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([2.0, dpt.nan, dpt.nan], dtype=dt)
+    x2 = dpt.asarray([dpt.nan, 2.0, dpt.nan], dtype=dt)
+
+    y = dpt.nextafter(x1, x2)
+    assert dpt.all(dpt.isnan(y))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_nextafter_special_cases_zero(dt):
+    """If x1_i is equal to x2_i, the result is x2_i."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([-0.0, 0.0, -0.0, 0.0], dtype=dt)
+    x2 = dpt.asarray([0.0, -0.0, -0.0, 0.0], dtype=dt)
+
+    y = dpt.nextafter(x1, x2)
+    assert dpt.all(y == 0)
+
+    skip_checking_signs = (
+        x1.dtype == dpt.float16
+        and x1.sycl_device.backend == dpctl.backend_type.cuda
+    )
+    if skip_checking_signs:
+        pytest.skip(
+            "Skipped checking signs for nextafter due to "
+            "known issue in DPC++ support for CUDA devices"
+        )
+    else:
+        assert dpt.all(dpt.signbit(y) == dpt.signbit(x2))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_nextafter_basic(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    s = 10
+    x1 = dpt.ones(s, dtype=dt, sycl_queue=q)
+    x2 = dpt.full(s, 2, dtype=dt, sycl_queue=q)
+
+    r = dpt.nextafter(x1, x2)
+    expected_diff = dpt.asarray(dpt.finfo(dt).eps, dtype=dt, sycl_queue=q)
+
+    assert dpt.all(r > 0)
+    assert dpt.all(r - x1 == expected_diff)
+
+    x3 = dpt.zeros(s, dtype=dt, sycl_queue=q)
+
+    r = dpt.nextafter(x3, x1)
+    assert dpt.all(r > 0)
+
+    r = dpt.nextafter(x1, x3)
+    assert dpt.all((r - x1) < 0)
+
+    r = dpt.nextafter(x1, 0)
+    assert dpt.all(x1 - r == (expected_diff) / 2)
+
+    r = dpt.nextafter(x3, dpt.inf)
+    assert dpt.all(r > 0)
+
+    r = dpt.nextafter(x3, -dpt.inf)
+    assert dpt.all(r < 0)
diff --git a/dpnp/tests/tensor/elementwise/test_not_equal.py b/dpnp/tests/tensor/elementwise/test_not_equal.py
new file mode 100644
index 000000000000..718105d2689b
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_not_equal.py
@@ -0,0 +1,227 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_not_equal_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.not_equal(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_dtype = np.not_equal(
+        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
+    ).dtype
+    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == np.full(r.shape, False, dtype=r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.not_equal(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_dtype = np.not_equal(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    ).dtype
+    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == np.full(r.shape, False, dtype=r.dtype)).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_not_equal_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.not_equal(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_not_equal_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.not_equal(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.not_equal(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.not_equal(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.not_equal(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.not_equal(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.not_equal(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.not_equal(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.not_equal(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.not_equal(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.not_equal(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_not_equal_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    r = dpt.not_equal(m, v)
+    expected = np.full((100, 5), [True, False, True, True, True], dtype="?")
+
+    assert (dpt.asnumpy(r) == expected).all()
+
+    r2 = dpt.not_equal(v, m)
+    assert (dpt.asnumpy(r2) == expected).all()
+
+    r3 = dpt.empty_like(m, dtype="?")
+    dpt.not_equal(m, v, out=r3)
+    assert (dpt.asnumpy(r3) == expected).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_not_equal_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_zeros = (
+        bool(0),
+        int(0),
+        float(0),
+        complex(0),
+        np.float32(0),
+        ctypes.c_int(0),
+    )
+    for sc in py_zeros:
+        R = dpt.not_equal(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        assert not dpt.all(R)
+        R = dpt.not_equal(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+        assert not dpt.all(R)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_not_equal_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.not_equal(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_not_equal_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.not_equal(a, c)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_not_equal_alignment(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n = 256
+    s = dpt.concat((dpt.zeros(n, dtype=dtype), dpt.zeros(n, dtype=dtype)))
+
+    mask = s[:-1] != s[1:]
+    (pos,) = dpt.nonzero(mask)
+    assert dpt.all(pos == n)
+
+    out_arr = dpt.zeros(2 * n, dtype=mask.dtype)
+    dpt.not_equal(s[:-1], s[1:], out=out_arr[1:])
+    (pos,) = dpt.nonzero(mask)
+    assert dpt.all(pos == (n + 1))
diff --git a/dpnp/tests/tensor/elementwise/test_positive.py b/dpnp/tests/tensor/elementwise/test_positive.py
new file mode 100644
index 000000000000..d4358e5827da
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_positive.py
@@ -0,0 +1,94 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes, _usm_types
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_positive_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q)
+    assert dpt.positive(X).dtype == arg_dt
+
+    r = dpt.empty_like(X, dtype=arg_dt)
+    dpt.positive(X, out=r)
+    assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.positive(X)))
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_positive_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("i4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    Y = dpt.positive(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = dpt.asnumpy(X)
+    assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_positive_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.ones(U.shape, dtype=U.dtype)
+        expected_Y[..., 1::2] = 0
+        expected_Y = np.transpose(expected_Y, perms)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.positive(U, order=ord)
+            assert np.allclose(dpt.asnumpy(Y), expected_Y)
diff --git a/dpnp/tests/tensor/elementwise/test_pow.py b/dpnp/tests/tensor/elementwise/test_pow.py
new file mode 100644
index 000000000000..17d54058c320
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_pow.py
@@ -0,0 +1,231 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
+def test_power_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.pow(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.power(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.pow(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.power(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_power_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.pow(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_pow_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.pow(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.pow(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.pow(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.pow(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.pow(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.pow(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.pow(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.pow(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.pow(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.pow(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+def test_pow_broadcasting():
+    get_queue_or_skip()
+
+    v = dpt.arange(1, 6, dtype="i4")
+    m = dpt.full((100, 5), 2, dtype="i4")
+
+    r = dpt.pow(m, v)
+
+    expected = np.power(
+        np.full((100, 5), 2, dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.pow(v, m)
+    expected2 = np.power(
+        np.arange(1, 6, dtype="i4"), np.full((100, 5), 2, dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_pow_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        complex(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.pow(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.pow(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_pow_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X **= int(1)
+    elif dt_kind == "f":
+        X **= float(1)
+    elif dt_kind == "c":
+        X **= complex(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
+def test_pow_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 **= ar2
+        assert (
+            dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype)
+        ).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+        ar3[::-1] *= ar4[::2]
+        assert (
+            dpt.asnumpy(ar3) == np.full(ar3.shape, 1, dtype=ar3.dtype)
+        ).all()
+
+    else:
+        with pytest.raises(ValueError):
+            ar1 **= ar2
+
+
+def test_pow_inplace_basic():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    expected = dpt.square(x)
+    x **= 2
+
+    assert dpt.all(x == expected)
diff --git a/dpnp/tests/tensor/elementwise/test_reciprocal.py b/dpnp/tests/tensor/elementwise/test_reciprocal.py
new file mode 100644
index 000000000000..dd31c3323f68
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_reciprocal.py
@@ -0,0 +1,108 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes, _complex_fp_dtypes
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_reciprocal_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    one = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    expected_dtype = dpt.divide(one, x).dtype
+    assert dpt.reciprocal(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_reciprocal_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    res = dpt.reciprocal(x)
+    expected = 1 / x
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert dpt.allclose(res, expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_reciprocal_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    res = dpt.reciprocal(x)
+    expected = 1 / x
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert dpt.allclose(res, expected, atol=tol, rtol=tol)
+
+
+def test_reciprocal_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = dpt.reciprocal(x)
+    expected = dpt.asarray([dpt.nan, dpt.inf, -dpt.inf, 0.0, -0.0], dtype="f4")
+    assert dpt.allclose(res, expected, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
+def test_reciprocal_complex_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
+
+    z = dpt.asarray(c_, dtype=dtype)
+    r = dpt.reciprocal(z)
+
+    expected = 1 / z
+
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpnp/tests/tensor/elementwise/test_remainder.py b/dpnp/tests/tensor/elementwise/test_remainder.py
new file mode 100644
index 000000000000..0770820599d1
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_remainder.py
@@ -0,0 +1,279 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _compare_dtypes,
+    _no_complex_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes)
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes)
+def test_remainder_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.remainder(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.remainder(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.remainder(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.remainder(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_remainder_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.remainder(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_remainder_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.remainder(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.remainder(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.remainder(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.remainder(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.remainder(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.remainder(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.remainder(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.remainder(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.remainder(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.remainder(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes[1:8:2])
+def test_remainder_negative_integers(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.arange(-5, -1, 1, dtype=dt, sycl_queue=q)
+    x_np = np.arange(-5, -1, 1, dtype=dt)
+    val = 3
+
+    r1 = dpt.remainder(x, val)
+    expected = np.remainder(x_np, val)
+    assert (dpt.asnumpy(r1) == expected).all()
+
+    r2 = dpt.remainder(val, x)
+    expected = np.remainder(val, x_np)
+    assert (dpt.asnumpy(r2) == expected).all()
+
+
+def test_remainder_integer_zero():
+    get_queue_or_skip()
+
+    for dt in ["i4", "u4"]:
+        x = dpt.ones(1, dtype=dt)
+        y = dpt.zeros_like(x)
+
+        assert (dpt.asnumpy(dpt.remainder(x, y)) == np.zeros(1, dtype=dt)).all()
+
+        x = dpt.astype(x, dt)
+        y = dpt.zeros_like(x)
+
+        assert (dpt.asnumpy(dpt.remainder(x, y)) == np.zeros(1, dtype=dt)).all()
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes[9:])
+def test_remainder_negative_floats(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.linspace(-5, 5, 20, dtype=dt, sycl_queue=q)
+    x_np = np.linspace(-5, 5, 20, dtype=dt)
+    val = 3
+
+    tol = 8 * dpt.finfo(dt).resolution
+
+    r1 = dpt.remainder(x, val)
+    expected = np.remainder(x_np, val)
+    with np.errstate(invalid="ignore"):
+        np.allclose(
+            dpt.asnumpy(r1), expected, rtol=tol, atol=tol, equal_nan=True
+        )
+
+    r2 = dpt.remainder(val, x)
+    expected = np.remainder(val, x_np)
+    with np.errstate(invalid="ignore"):
+        np.allclose(
+            dpt.asnumpy(r2), expected, rtol=tol, atol=tol, equal_nan=True
+        )
+
+
+def test_remainder_special_cases():
+    get_queue_or_skip()
+
+    lhs = [dpt.nan, dpt.inf, 0.0, -0.0, -0.0, 1.0, dpt.inf, -dpt.inf]
+    rhs = [dpt.nan, dpt.inf, -0.0, 1.0, 1.0, 0.0, 1.0, -1.0]
+
+    x, y = dpt.asarray(lhs, dtype="f4"), dpt.asarray(rhs, dtype="f4")
+
+    x_np, y_np = np.asarray(lhs, dtype="f4"), np.asarray(rhs, dtype="f4")
+
+    res = dpt.remainder(x, y)
+
+    with np.errstate(invalid="ignore"):
+        np.allclose(dpt.asnumpy(res), np.remainder(x_np, y_np))
+
+
+@pytest.mark.parametrize("arr_dt", _no_complex_dtypes)
+def test_remainder_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.remainder(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.remainder(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
+def test_remainder_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X %= int(1)
+    elif dt_kind == "f":
+        X %= float(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
+def test_remainder_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 %= ar2
+        assert dpt.all(ar1 == dpt.zeros(ar1.shape, dtype=ar1.dtype))
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+        ar3[::-1] %= ar4[::2]
+        assert dpt.all(ar3 == dpt.zeros(ar3.shape, dtype=ar3.dtype))
+
+    else:
+        with pytest.raises(ValueError):
+            ar1 %= ar2
+
+
+def test_remainder_inplace_basic():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    expected = x & 1
+    x %= 2
+
+    assert dpt.all(x == expected)
diff --git a/dpnp/tests/tensor/elementwise/test_round.py b/dpnp/tests/tensor/elementwise/test_round.py
new file mode 100644
index 000000000000..5cfcb6dd598e
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_round.py
@@ -0,0 +1,234 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose, assert_array_equal
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_round_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0.1, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.round(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.round(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_round_real_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    Xnp = np.linspace(0.01, 88.1, num=n_seq, dtype=dtype)
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt.round(X)
+    Ynp = np.round(Xnp)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(Y), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt.round(X, out=Z)
+
+    assert_allclose(dpt.asnumpy(Z), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_round_complex_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    low = -88.0
+    high = 88.0
+    x1 = np.random.uniform(low=low, high=high, size=n_seq)
+    x2 = np.random.uniform(low=low, high=high, size=n_seq)
+    Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt.round(X)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(Y), np.repeat(np.round(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt.round(X, out=Z)
+
+    assert_allclose(
+        dpt.asnumpy(Z), np.repeat(np.round(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_round_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 16.2
+    X[..., 1::2] = 23.7
+
+    Y = dpt.round(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.round(np.float32(16.2))
+    expected_Y[..., 1::2] = np.round(np.float32(23.7))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_round_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 8.8
+    X[..., 1::2] = 11.3
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.round(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.round(U, order=ord)
+            assert_allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_round_real_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    x = [np.nan, np.inf, -np.inf, 1.5, 2.5, -1.5, -2.5, 0.0, -0.0]
+    Xnp = np.array(x, dtype=dtype)
+    X = dpt.asarray(x, dtype=dtype)
+
+    Y = dpt.asnumpy(dpt.round(X))
+    Ynp = np.round(Xnp)
+    assert_allclose(Y, Ynp, atol=tol, rtol=tol)
+    assert_array_equal(np.signbit(Y), np.signbit(Ynp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_round_real_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=0.01, high=88.1, size=ii)
+        Xnp.astype(dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np.round(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt.round(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_round_complex_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -88.0
+    high = 88.0
+    for ii in sizes:
+        x1 = np.random.uniform(low=low, high=high, size=ii)
+        x2 = np.random.uniform(low=low, high=high, size=ii)
+        Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np.round(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt.round(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_round_complex_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, 1.5, 2.5, -1.5, -2.5, 0.0, -0.0]
+    xc = [complex(*val) for val in itertools.product(x, repeat=2)]
+
+    Xc_np = np.array(xc, dtype=dtype)
+    Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q)
+
+    Ynp = np.round(Xc_np)
+    Y = dpt.round(Xc)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt.real(Y)), np.real(Ynp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(dpt.imag(Y)), np.imag(Ynp), atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_rsqrt.py b/dpnp/tests/tensor/elementwise/test_rsqrt.py
new file mode 100644
index 000000000000..559de121e9be
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_rsqrt.py
@@ -0,0 +1,93 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _map_to_device_dtype,
+    _no_complex_dtypes,
+    _real_fp_dtypes,
+)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_rsqrt_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.reciprocal(np.sqrt(np.array(1, dtype=dtype))).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.rsqrt(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_rsqrt_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    res = dpt.rsqrt(x)
+    expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype))
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_rsqrt_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    res = dpt.rsqrt(x)
+    expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype))
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol)
+
+
+def test_rsqrt_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.asarray([dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = dpt.rsqrt(x)
+    expected = dpt.asarray(
+        [dpt.nan, dpt.nan, dpt.inf, -dpt.inf, 0.0, dpt.nan], dtype="f4"
+    )
+    assert dpt.allclose(res, expected, equal_nan=True)
diff --git a/dpnp/tests/tensor/elementwise/test_sign.py b/dpnp/tests/tensor/elementwise/test_sign.py
new file mode 100644
index 000000000000..e2addb23b711
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_sign.py
@@ -0,0 +1,140 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _no_complex_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_sign_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q)
+    assert dpt.sign(X).dtype == arg_dt
+
+    r = dpt.empty_like(X, dtype=arg_dt)
+    dpt.sign(X, out=r)
+    assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.sign(X)))
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_sign_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("i4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    Y = dpt.sign(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = dpt.asnumpy(X)
+    assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_sign_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    expected_dt = np.sign(np.ones(tuple(), dtype=arg_dt)).dtype
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.ones(U.shape, dtype=expected_dt)
+        expected_Y[..., 1::2] = 0
+        expected_Y = np.transpose(expected_Y, perms)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.sign(U, order=ord)
+            assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_sign_complex(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    Xnp = np.random.standard_normal(
+        size=input_shape
+    ) + 1j * np.random.standard_normal(size=input_shape)
+    Xnp = Xnp.astype(arg_dt)
+    X[...] = Xnp
+
+    for ord in ["C", "F", "A", "K"]:
+        for perms in itertools.permutations(range(4)):
+            U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+            Y = dpt.sign(U, order=ord)
+            X_t = np.transpose(Xnp[:, ::-1, ::-1, :], perms)
+            expected_Y = X_t / np.abs(X_t)
+            tol = dpt.finfo(Y.dtype).resolution
+            np.testing.assert_allclose(
+                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
+            )
+
+
+# test for all signed real data types
+@pytest.mark.parametrize(
+    "dt", _no_complex_dtypes[1:8:2] + _no_complex_dtypes[9:]
+)
+def test_sign_negative(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.arange(-20, 20, 1, dtype=dt, sycl_queue=q)
+    x_np = np.arange(-20, 20, 1, dtype=dt)
+    res = dpt.sign(x)
+
+    assert (dpt.asnumpy(res) == np.sign(x_np)).all()
diff --git a/dpnp/tests/tensor/elementwise/test_signbit.py b/dpnp/tests/tensor/elementwise/test_signbit.py
new file mode 100644
index 000000000000..9006bcafbd2d
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_signbit.py
@@ -0,0 +1,124 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_signbit_out_type_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    x = dpt.linspace(1, 10, num=256, dtype=arg_dt)
+    sb = dpt.signbit(x)
+    assert sb.dtype == dpt.bool
+
+    assert not dpt.any(sb)
+
+    x2 = dpt.linspace(-10, -1, num=256, dtype=arg_dt)
+    sb2 = dpt.signbit(x2)
+    assert dpt.all(sb2)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_signbit_out_type_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    x = dpt.linspace(1, 10, num=256, dtype=arg_dt)
+    sb = dpt.signbit(x[::-3])
+    assert sb.dtype == dpt.bool
+
+    assert not dpt.any(sb)
+
+    x2 = dpt.linspace(-10, -1, num=256, dtype=arg_dt)
+    sb2 = dpt.signbit(x2[::-3])
+    assert dpt.all(sb2)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_signbit_special_cases_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    n = 63
+    x1 = dpt.full(n, -dpt.inf, dtype=arg_dt)
+    x2 = dpt.full(n, -0.0, dtype=arg_dt)
+    x3 = dpt.full(n, 0.0, dtype=arg_dt)
+    x4 = dpt.full(n, dpt.inf, dtype=arg_dt)
+
+    x = dpt.concat((x1, x2, x3, x4))
+    actual = dpt.signbit(x)
+
+    expected = dpt.concat(
+        (
+            dpt.full(x1.size, True),
+            dpt.full(x2.size, True),
+            dpt.full(x3.size, False),
+            dpt.full(x4.size, False),
+        )
+    )
+
+    assert dpt.all(dpt.equal(actual, expected))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_signbit_special_cases_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    x1 = dpt.full(63, -dpt.inf, dtype=arg_dt)
+    x2 = dpt.full(63, -0.0, dtype=arg_dt)
+    x3 = dpt.full(63, 0.0, dtype=arg_dt)
+    x4 = dpt.full(63, dpt.inf, dtype=arg_dt)
+
+    x = dpt.concat((x1, x2, x3, x4))
+    actual = dpt.signbit(x[::-1])
+
+    expected = dpt.concat(
+        (
+            dpt.full(x4.size, False),
+            dpt.full(x3.size, False),
+            dpt.full(x2.size, True),
+            dpt.full(x1.size, True),
+        )
+    )
+
+    assert dpt.all(dpt.equal(actual, expected))
diff --git a/dpnp/tests/tensor/elementwise/test_sqrt.py b/dpnp/tests/tensor/elementwise/test_sqrt.py
new file mode 100644
index 000000000000..d6bc7a42434e
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_sqrt.py
@@ -0,0 +1,207 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+import warnings
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose, assert_equal
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _complex_fp_dtypes,
+    _map_to_device_dtype,
+    _real_fp_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_sqrt_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.sqrt(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.sqrt(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_sqrt_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.sqrt(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_sqrt_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.sqrt(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_sqrt_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 16.0
+    X[..., 1::2] = 23.0
+
+    Y = dpt.sqrt(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.sqrt(np.float32(16.0))
+    expected_Y[..., 1::2] = np.sqrt(np.float32(23.0))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_sqrt_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 16.0
+    X[..., 1::2] = 23.0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.sqrt(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.sqrt(U, order=ord)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
+def test_sqrt_special_cases():
+    q = get_queue_or_skip()
+
+    X = dpt.asarray(
+        [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q
+    )
+    Xnp = dpt.asnumpy(X)
+
+    assert_equal(dpt.asnumpy(dpt.sqrt(X)), np.sqrt(Xnp))
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_sqrt_real_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+
+    x = dpt.asarray(inps_, dtype=dtype)
+    r = dpt.sqrt(x)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.sqrt(np.asarray(inps_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
+def test_sqrt_complex_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
+
+    z = dpt.asarray(c_, dtype=dtype)
+    r = dpt.sqrt(z)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.sqrt(np.asarray(c_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    if not dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True):
+        for i in range(r.shape[0]):
+            failure_data = []
+            if not dpt.allclose(
+                r[i], expected[i], atol=tol, rtol=tol, equal_nan=True
+            ):
+                msg = (
+                    f"Test failed for input {z[i]}, i.e. {c_[i]} for index {i}"
+                )
+                msg += f", results were {r[i]} vs. {expected[i]}"
+                failure_data.extend(msg)
+        pytest.skip(reason=msg)
diff --git a/dpnp/tests/tensor/elementwise/test_square.py b/dpnp/tests/tensor/elementwise/test_square.py
new file mode 100644
index 000000000000..0b65e9af53ce
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_square.py
@@ -0,0 +1,114 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes, _usm_types
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_square_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    X = dpt.arange(5, dtype=arg_dt, sycl_queue=q)
+    assert dpt.square(X).dtype == arg_dt
+
+    r = dpt.empty_like(X, dtype=arg_dt)
+    dpt.square(X, out=r)
+    assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.square(X)))
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_square_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("i4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    Y = dpt.square(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = dpt.asnumpy(X)
+    assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_square_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 2
+    X[..., 1::2] = 0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.full(U.shape, 4, dtype=U.dtype)
+        expected_Y[..., 1::2] = 0
+        expected_Y = np.transpose(expected_Y, perms)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.square(U, order=ord)
+            assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_square_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    vals = [np.nan, np.inf, -np.inf, 0.0, -0.0]
+    X = dpt.asarray(vals, dtype=dtype, sycl_queue=q)
+    X_np = dpt.asnumpy(X)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    with np.errstate(all="ignore"):
+        assert np.allclose(
+            dpt.asnumpy(dpt.square(X)),
+            np.square(X_np),
+            atol=tol,
+            rtol=tol,
+            equal_nan=True,
+        )
diff --git a/dpnp/tests/tensor/elementwise/test_subtract.py b/dpnp/tests/tensor/elementwise/test_subtract.py
new file mode 100644
index 000000000000..70c652c7e65a
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_subtract.py
@@ -0,0 +1,254 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
+def test_subtract_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.subtract(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_dtype = np.subtract(
+        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
+    ).dtype
+    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == np.full(r.shape, 0, dtype=r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r2 = dpt.empty_like(ar1, dtype=r.dtype)
+    dpt.subtract(ar1, ar2, out=r2)
+    assert (dpt.asnumpy(r2) == np.full(r2.shape, 0, dtype=r2.dtype)).all()
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.subtract(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_dtype = np.subtract(
+        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
+    ).dtype
+    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == np.full(r.shape, 0, dtype=r.dtype)).all()
+
+    r2 = dpt.empty_like(ar1, dtype=r.dtype)
+    dpt.subtract(ar3[::-1], ar4[::2], out=r2)
+    assert (dpt.asnumpy(r2) == np.full(r2.shape, 0, dtype=r2.dtype)).all()
+
+
+def test_subtract_bool():
+    get_queue_or_skip()
+    ar1 = dpt.ones(127, dtype="?")
+    ar2 = dpt.ones_like(ar1, dtype="?")
+    with pytest.raises(ValueError):
+        dpt.subtract(ar1, ar2)
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_subtract_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.subtract(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (op1_usm_type, op2_usm_type)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+def test_subtract_order():
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
+        ar1 = dpt.ones(test_shape, dtype=dt1, order="C")
+        ar2 = dpt.ones(test_shape, dtype=dt2, order="C")
+        r1 = dpt.subtract(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.subtract(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.subtract(ar1, ar2, order="A")
+        assert r3.flags.c_contiguous
+        r4 = dpt.subtract(ar1, ar2, order="K")
+        assert r4.flags.c_contiguous
+
+        ar1 = dpt.ones(test_shape, dtype=dt1, order="F")
+        ar2 = dpt.ones(test_shape, dtype=dt2, order="F")
+        r1 = dpt.subtract(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.subtract(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.subtract(ar1, ar2, order="A")
+        assert r3.flags.f_contiguous
+        r4 = dpt.subtract(ar1, ar2, order="K")
+        assert r4.flags.f_contiguous
+
+        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2]
+        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2]
+        r4 = dpt.subtract(ar1, ar2, order="K")
+        assert r4.strides == (n, -1)
+        r5 = dpt.subtract(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT
+        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT
+        r4 = dpt.subtract(ar1, ar2, order="K")
+        assert r4.strides == (-1, n)
+        r5 = dpt.subtract(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+
+def test_subtract_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    r = dpt.subtract(m, v)
+    assert (
+        dpt.asnumpy(r) == np.arange(1, -4, step=-1, dtype="i4")[np.newaxis, :]
+    ).all()
+
+    r2 = dpt.subtract(v, m)
+    assert (
+        dpt.asnumpy(r2) == np.arange(-1, 4, dtype="i4")[np.newaxis, :]
+    ).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes[1:])
+def test_subtract_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_zeros = (
+        bool(0),
+        int(0),
+        float(0),
+        complex(0),
+        np.float32(0),
+        ctypes.c_int(0),
+    )
+    for sc in py_zeros:
+        R = dpt.subtract(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.subtract(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_subtract_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X -= int(0)
+    elif dt_kind == "f":
+        X -= float(0)
+    elif dt_kind == "c":
+        X -= complex(0)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
+def test_subtract_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 -= ar2
+        assert (dpt.asnumpy(ar1) == np.zeros(ar1.shape, dtype=ar1.dtype)).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+        ar3[::-1] -= ar4[::2]
+        assert (dpt.asnumpy(ar3) == np.zeros(ar3.shape, dtype=ar3.dtype)).all()
+
+    else:
+        with pytest.raises(ValueError):
+            ar1 -= ar2
+
+
+def test_subtract_inplace_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    m -= v
+    assert (
+        dpt.asnumpy(m) == np.arange(1, -4, step=-1, dtype="i4")[np.newaxis, :]
+    ).all()
diff --git a/dpnp/tests/tensor/elementwise/test_trigonometric.py b/dpnp/tests/tensor/elementwise/test_trigonometric.py
new file mode 100644
index 000000000000..497432360306
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_trigonometric.py
@@ -0,0 +1,234 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+)
+
+_trig_funcs = [(np.sin, dpt.sin), (np.cos, dpt.cos), (np.tan, dpt.tan)]
+_inv_trig_funcs = [
+    (np.arcsin, dpt.asin),
+    (np.arccos, dpt.acos),
+    (np.arctan, dpt.atan),
+]
+_all_funcs = _trig_funcs + _inv_trig_funcs
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_trig_out_type(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np_call(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt_call(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_trig_real_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    if np_call in _trig_funcs:
+        Xnp = np.linspace(
+            -np.pi / 2 * 0.99, np.pi / 2 * 0.99, num=n_seq, dtype=dtype
+        )
+    if np_call == np.arctan:
+        Xnp = np.linspace(-100.0, 100.0, num=n_seq, dtype=dtype)
+    else:
+        Xnp = np.linspace(-1.0, 1.0, num=n_seq, dtype=dtype)
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt_call(X)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(
+        dpt.asnumpy(Z), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_trig_complex_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 256
+    n_rep = 137
+    low = -9.0
+    high = 9.0
+    x1 = np.random.uniform(low=low, high=high, size=n_seq)
+    x2 = np.random.uniform(low=low, high=high, size=n_seq)
+    Xnp = x1 + 1j * x2
+
+    # stay away from poles and branch lines
+    modulus = np.abs(Xnp)
+    sel = np.logical_or(
+        modulus < 0.9,
+        np.logical_and(
+            modulus > 1.2, np.minimum(np.abs(x2), np.abs(x1)) > 0.05
+        ),
+    )
+    Xnp = Xnp[sel]
+
+    X = dpt.repeat(dpt.asarray(Xnp, dtype=dtype, sycl_queue=q), n_rep)
+    Y = dpt_call(X)
+
+    expected = np.repeat(np_call(Xnp.astype(dtype)), n_rep)
+
+    tol = 50 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(Y), expected, atol=tol, rtol=tol)
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(dpt.asnumpy(Z), expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_trig_real_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 3, 4, 6, 8, 9, 24, 50, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -100.0
+    high = 100.0
+    if np_call in [np.arccos, np.arcsin]:
+        low = -1.0
+        high = 1.0
+    elif np_call in [np.tan]:
+        low = -np.pi / 2 * (0.99)
+        high = np.pi / 2 * (0.99)
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=low, high=high, size=ii)
+        Xnp.astype(dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_trig_complex_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 50 * dpt.finfo(dtype).resolution
+
+    low = -9.0
+    high = 9.0
+    while True:
+        x1 = np.random.uniform(low=low, high=high, size=2 * sum(sizes))
+        x2 = np.random.uniform(low=low, high=high, size=2 * sum(sizes))
+        Xnp_all = np.array(
+            [complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype
+        )
+
+        # stay away from poles and branch lines
+        modulus = np.abs(Xnp_all)
+        sel = np.logical_or(
+            modulus < 0.9,
+            np.logical_and(
+                modulus > 1.2, np.minimum(np.abs(x2), np.abs(x1)) > 0.05
+            ),
+        )
+        Xnp_all = Xnp_all[sel]
+        if Xnp_all.size > sum(sizes):
+            break
+
+    pos = 0
+    for ii in sizes:
+        pos = pos + ii
+        Xnp = Xnp_all[:pos]
+        Xnp = Xnp[-ii:]
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_trig_real_special_cases(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, 2.0, -2.0, +0.0, -0.0, +1.0, -1.0]
+
+    xf = np.array(x, dtype=dtype)
+    yf = dpt.asarray(xf, dtype=dtype, sycl_queue=q)
+
+    with np.errstate(all="ignore"):
+        Y_np = np_call(xf)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    Y = dpt_call(yf)
+    assert_allclose(dpt.asnumpy(Y), Y_np, atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_type_utils.py b/dpnp/tests/tensor/elementwise/test_type_utils.py
new file mode 100644
index 000000000000..45b1501796a3
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_type_utils.py
@@ -0,0 +1,254 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+import dpnp.tensor._type_utils as tu
+
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+)
+
+
+class MockDevice:
+    def __init__(self, fp16: bool, fp64: bool):
+        self.has_aspect_fp16 = fp16
+        self.has_aspect_fp64 = fp64
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_type_utils_map_to_device_type(dtype):
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            dev = MockDevice(fp16, fp64)
+            dt_in = dpt.dtype(dtype)
+            dt_out = _map_to_device_dtype(dt_in, dev)
+            assert isinstance(dt_out, dpt.dtype)
+
+
+def test_type_util_all_data_types():
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            r = tu._all_data_types(fp16, fp64)
+            assert isinstance(r, list)
+            # 11: bool + 4 signed + 4 unsigned inegral + float32 + complex64
+            assert len(r) == 11 + int(fp16) + 2 * int(fp64)
+
+
+def test_type_util_can_cast():
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            for from_ in _all_dtypes:
+                for to_ in _all_dtypes:
+                    r = tu._can_cast(
+                        dpt.dtype(from_), dpt.dtype(to_), fp16, fp64
+                    )
+                    assert isinstance(r, bool)
+
+
+def test_type_utils_find_buf_dtype():
+    def _denier_fn(dt):
+        return False
+
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            dev = MockDevice(fp16, fp64)
+            arg_dt = dpt.float64
+            r = tu._find_buf_dtype(
+                arg_dt, _denier_fn, dev, tu._acceptance_fn_default_unary
+            )
+            assert r == (
+                None,
+                None,
+            )
+
+
+def test_type_utils_get_device_default_type():
+    with pytest.raises(RuntimeError):
+        tu._get_device_default_dtype("-", MockDevice(True, True))
+    try:
+        dev = dpctl.SyclDevice()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    for k in ["b", "i", "u", "f", "c"]:
+        dt = tu._get_device_default_dtype(k, dev)
+        assert isinstance(dt, dpt.dtype)
+        assert dt.kind == k
+
+
+def test_type_utils_find_buf_dtype2():
+    def _denier_fn(dt1, dt2):
+        return False
+
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            dev = MockDevice(fp16, fp64)
+            arg1_dt = dpt.float64
+            arg2_dt = dpt.complex64
+            r = tu._find_buf_dtype2(
+                arg1_dt,
+                arg2_dt,
+                _denier_fn,
+                dev,
+                tu._acceptance_fn_default_binary,
+            )
+            assert r == (
+                None,
+                None,
+                None,
+            )
+
+
+def test_unary_func_arg_validation():
+    with pytest.raises(TypeError):
+        dpt.abs([1, 2, 3])
+    try:
+        a = dpt.arange(8)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    dpt.abs(a, order="invalid")
+
+
+def test_binary_func_arg_validation():
+    with pytest.raises(dpctl.utils.ExecutionPlacementError):
+        dpt.add([1, 2, 3], 1)
+    try:
+        a = dpt.arange(8)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(ValueError):
+        dpt.add(a, Ellipsis)
+    dpt.add(a, a, order="invalid")
+
+
+def test_all_data_types():
+    fp16_fp64_types = {dpt.float16, dpt.float64, dpt.complex128}
+    fp64_types = {dpt.float64, dpt.complex128}
+
+    all_dts = tu._all_data_types(True, True)
+    assert fp16_fp64_types.issubset(all_dts)
+
+    all_dts = tu._all_data_types(True, False)
+    assert dpt.float16 in all_dts
+    assert not fp64_types.issubset(all_dts)
+
+    all_dts = tu._all_data_types(False, True)
+    assert dpt.float16 not in all_dts
+    assert fp64_types.issubset(all_dts)
+
+    all_dts = tu._all_data_types(False, False)
+    assert not fp16_fp64_types.issubset(all_dts)
+
+
+@pytest.mark.parametrize("fp16", [True, False])
+@pytest.mark.parametrize("fp64", [True, False])
+def test_maximal_inexact_types(fp16, fp64):
+    assert not tu._is_maximal_inexact_type(dpt.int32, fp16, fp64)
+    assert fp64 == tu._is_maximal_inexact_type(dpt.float64, fp16, fp64)
+    assert fp64 == tu._is_maximal_inexact_type(dpt.complex128, fp16, fp64)
+    assert fp64 != tu._is_maximal_inexact_type(dpt.float32, fp16, fp64)
+    assert fp64 != tu._is_maximal_inexact_type(dpt.complex64, fp16, fp64)
+
+
+def test_can_cast_device():
+    assert tu._can_cast(dpt.int64, dpt.float64, True, True)
+    # if f8 is available, can't cast i8 to f4
+    assert not tu._can_cast(dpt.int64, dpt.float32, True, True)
+    assert not tu._can_cast(dpt.int64, dpt.float32, False, True)
+    # should be able to cast to f8 when f2 unavailable
+    assert tu._can_cast(dpt.int64, dpt.float64, False, True)
+    # casting to f4 acceptable when f8 unavailable
+    assert tu._can_cast(dpt.int64, dpt.float32, True, False)
+    assert tu._can_cast(dpt.int64, dpt.float32, False, False)
+    # can't safely cast inexact type to inexact type of lesser precision
+    assert not tu._can_cast(dpt.float32, dpt.float16, True, False)
+    assert not tu._can_cast(dpt.float64, dpt.float32, False, True)
+
+
+def test_acceptance_fns():
+    """Check type promotion acceptance functions"""
+    try:
+        dev = dpctl.SyclDevice()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("Default device is not available")
+    assert tu._acceptance_fn_reciprocal(
+        dpt.float32, dpt.float32, dpt.float32, dev
+    )
+    assert tu._acceptance_fn_negative(dpt.int8, dpt.int16, dpt.int16, dev)
+
+
+def test_weak_types():
+    wbt = tu.WeakBooleanType(True)
+    assert wbt.get()
+    assert tu._weak_type_num_kind(wbt) == 0
+
+    wit = tu.WeakIntegralType(7)
+    assert wit.get() == 7
+    assert tu._weak_type_num_kind(wit) == 1
+
+    wft = tu.WeakFloatingType(3.1415926)
+    assert wft.get() == 3.1415926
+    assert tu._weak_type_num_kind(wft) == 2
+
+    wct = tu.WeakComplexType(2.0 + 3.0j)
+    assert wct.get() == 2 + 3j
+    assert tu._weak_type_num_kind(wct) == 3
+
+
+def test_arg_validation():
+    with pytest.raises(TypeError):
+        tu._weak_type_num_kind(dict())
+
+    with pytest.raises(TypeError):
+        tu._strong_dtype_num_kind(Ellipsis)
+
+    with pytest.raises(ValueError):
+        tu._strong_dtype_num_kind(np.dtype("O"))
+
+    wt = tu.WeakFloatingType(2.0)
+    with pytest.raises(ValueError):
+        tu._resolve_weak_types(wt, wt, None)
diff --git a/dpnp/tests/tensor/elementwise/utils.py b/dpnp/tests/tensor/elementwise/utils.py
new file mode 100644
index 000000000000..6717ea577bd3
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/utils.py
@@ -0,0 +1,74 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+
+import dpnp.tensor._type_utils as tu
+
+_integral_dtypes = [
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+]
+_real_fp_dtypes = ["f2", "f4", "f8"]
+_complex_fp_dtypes = [
+    "c8",
+    "c16",
+]
+_real_value_dtypes = _integral_dtypes + _real_fp_dtypes
+_no_complex_dtypes = [
+    "b1",
+] + _real_value_dtypes
+_all_dtypes = _no_complex_dtypes + _complex_fp_dtypes
+
+_usm_types = ["device", "shared", "host"]
+
+
+def _map_to_device_dtype(dt, dev):
+    return tu._to_device_supported_dtype(dt, dev)
+
+
+def _compare_dtypes(dt, ref_dt, sycl_queue=None):
+    assert isinstance(sycl_queue, dpctl.SyclQueue)
+    dev = sycl_queue.sycl_device
+    expected_dt = _map_to_device_dtype(ref_dt, dev)
+    return dt == expected_dt
+
+
+__all__ = [
+    "_no_complex_dtypes",
+    "_all_dtypes",
+    "_usm_types",
+    "_map_to_device_dtype",
+    "_compare_dtypes",
+]
diff --git a/dpnp/tests/tensor/helper/__init__.py b/dpnp/tests/tensor/helper/__init__.py
new file mode 100644
index 000000000000..7fdb1fbe553b
--- /dev/null
+++ b/dpnp/tests/tensor/helper/__init__.py
@@ -0,0 +1,47 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+"""Helper module for tensor tests"""
+
+from ._helper import (
+    create_invalid_capsule,
+    get_queue_or_skip,
+    has_cpu,
+    has_gpu,
+    has_sycl_platforms,
+    skip_if_dtype_not_supported,
+)
+
+__all__ = [
+    "create_invalid_capsule",
+    "has_cpu",
+    "has_gpu",
+    "has_sycl_platforms",
+    "get_queue_or_skip",
+    "skip_if_dtype_not_supported",
+]
diff --git a/dpnp/tests/tensor/helper/_helper.py b/dpnp/tests/tensor/helper/_helper.py
new file mode 100644
index 000000000000..5d0b4825e953
--- /dev/null
+++ b/dpnp/tests/tensor/helper/_helper.py
@@ -0,0 +1,89 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import pytest
+
+
+def has_gpu(backend="opencl"):
+    return bool(dpctl.get_num_devices(backend=backend, device_type="gpu"))
+
+
+def has_cpu(backend="opencl"):
+    return bool(dpctl.get_num_devices(backend=backend, device_type="cpu"))
+
+
+def has_sycl_platforms():
+    return bool(len(dpctl.get_platforms()))
+
+
+def create_invalid_capsule():
+    """Creates an invalid capsule for the purpose of testing dpctl
+    constructors that accept capsules.
+    """
+    import ctypes
+
+    ctor = ctypes.pythonapi.PyCapsule_New
+    ctor.restype = ctypes.py_object
+    ctor.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
+    return ctor(id(ctor), b"invalid", 0)
+
+
+def get_queue_or_skip(args=()):
+    try:
+        q = dpctl.SyclQueue(*args)
+    except dpctl.SyclQueueCreationError:
+        pytest.skip(f"Queue could not be created from {args}")
+    return q
+
+
+def skip_if_dtype_not_supported(dt, q_or_dev):
+    import dpnp.tensor as dpt
+
+    dt = dpt.dtype(dt)
+    if type(q_or_dev) is dpctl.SyclQueue:
+        dev = q_or_dev.sycl_device
+    elif type(q_or_dev) is dpctl.SyclDevice:
+        dev = q_or_dev
+    else:
+        raise TypeError(
+            "Expected dpctl.SyclQueue or dpctl.SyclDevice, "
+            f"got {type(q_or_dev)}"
+        )
+    dev_has_dp = dev.has_aspect_fp64
+    if dev_has_dp is False and dt in [dpt.float64, dpt.complex128]:
+        pytest.skip(
+            f"{dev.name} does not support double precision floating point types"
+        )
+    dev_has_hp = dev.has_aspect_fp16
+    if dev_has_hp is False and dt in [
+        dpt.float16,
+    ]:
+        pytest.skip(
+            f"{dev.name} does not support half precision floating point type"
+        )
diff --git a/dpnp/tests/tensor/test_tensor_accumulation.py b/dpnp/tests/tensor/test_tensor_accumulation.py
new file mode 100644
index 000000000000..66e979e63a38
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_accumulation.py
@@ -0,0 +1,450 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from random import randrange
+
+import pytest
+from dpctl.utils import ExecutionPlacementError
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+sint_types = [
+    dpt.int8,
+    dpt.int16,
+    dpt.int32,
+    dpt.int64,
+]
+uint_types = [
+    dpt.uint8,
+    dpt.uint16,
+    dpt.uint32,
+    dpt.uint64,
+]
+rfp_types = [
+    dpt.float16,
+    dpt.float32,
+    dpt.float64,
+]
+cfp_types = [
+    dpt.complex64,
+    dpt.complex128,
+]
+
+no_complex_types = [dpt.bool] + sint_types + uint_types + rfp_types
+
+all_types = [dpt.bool] + sint_types + uint_types + rfp_types + cfp_types
+
+
+@pytest.mark.parametrize("dt", sint_types)
+def test_contig_cumsum_sint(dt):
+    get_queue_or_skip()
+    n = 10000
+    x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), n)
+
+    res = dpt.cumulative_sum(x, dtype=dt)
+
+    ar = dpt.arange(n, dtype=dt)
+    expected = dpt.concat((1 + ar, dpt.flip(ar)))
+    assert dpt.all(res == expected)
+
+
+@pytest.mark.parametrize("dt", sint_types)
+def test_strided_cumsum_sint(dt):
+    get_queue_or_skip()
+    n = 10000
+    x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), 2 * n)[1::2]
+
+    res = dpt.cumulative_sum(x, dtype=dt)
+
+    ar = dpt.arange(n, dtype=dt)
+    expected = dpt.concat((1 + ar, dpt.flip(ar)))
+    assert dpt.all(res == expected)
+
+    x2 = dpt.repeat(dpt.asarray([-1, 1], dtype=dt), 2 * n)[-1::-2]
+
+    res = dpt.cumulative_sum(x2, dtype=dt)
+
+    ar = dpt.arange(n, dtype=dt)
+    expected = dpt.concat((1 + ar, dpt.flip(ar)))
+    assert dpt.all(res == expected)
+
+
+@pytest.mark.parametrize("dt", sint_types)
+def test_contig_cumsum_axis_sint(dt):
+    get_queue_or_skip()
+    n0, n1 = 1000, 173
+    x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), n0)
+    m = dpt.tile(dpt.expand_dims(x, axis=1), (1, n1))
+
+    res = dpt.cumulative_sum(m, dtype=dt, axis=0)
+
+    ar = dpt.arange(n0, dtype=dt)
+    expected = dpt.concat((1 + ar, dpt.flip(ar)))
+    assert dpt.all(res == dpt.expand_dims(expected, axis=1))
+
+
+@pytest.mark.parametrize("dt", sint_types)
+def test_strided_cumsum_axis_sint(dt):
+    get_queue_or_skip()
+    n0, n1 = 1000, 173
+    x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), 2 * n0)
+    m = dpt.tile(dpt.expand_dims(x, axis=1), (1, n1))[1::2, ::-1]
+
+    res = dpt.cumulative_sum(m, dtype=dt, axis=0)
+
+    ar = dpt.arange(n0, dtype=dt)
+    expected = dpt.concat((1 + ar, dpt.flip(ar)))
+    assert dpt.all(res == dpt.expand_dims(expected, axis=1))
+
+
+def test_accumulate_scalar():
+    get_queue_or_skip()
+
+    s = dpt.asarray(1, dtype="i8")
+    r = dpt.cumulative_sum(s)
+    assert r == s
+    assert r.ndim == s.ndim
+
+    r = dpt.cumulative_sum(s, include_initial=True)
+    r_expected = dpt.asarray([0, 1], dtype="i8")
+    assert dpt.all(r == r_expected)
+
+
+def test_cumulative_sum_include_initial():
+    get_queue_or_skip()
+
+    n0, n1 = 3, 5
+    x = dpt.ones((n0, n1), dtype="i4")
+    r = dpt.cumulative_sum(x, axis=0, include_initial=True)
+    assert dpt.all(r[0, :] == 0)
+
+    r = dpt.cumulative_sum(x, axis=1, include_initial=True)
+    assert dpt.all(r[:, 0] == 0)
+
+    x = dpt.ones(n1, dtype="i4")
+    r = dpt.cumulative_sum(x, include_initial=True)
+    assert r.shape == (n1 + 1,)
+    assert r[0] == 0
+
+
+def test_cumulative_prod_identity():
+    get_queue_or_skip()
+
+    x = dpt.zeros(5, dtype="i4")
+    r = dpt.cumulative_prod(x, include_initial=True)
+    assert r[0] == 1
+
+
+def test_cumulative_logsumexp_identity():
+    get_queue_or_skip()
+
+    x = dpt.ones(5, dtype="f4")
+    r = dpt.cumulative_logsumexp(x, include_initial=True)
+    assert r[0] == -dpt.inf
+
+
+def test_accumulate_zero_size_dims():
+    get_queue_or_skip()
+
+    n0, n1, n2 = 3, 0, 5
+    x = dpt.ones((n0, n1, n2), dtype="i8")
+    r = dpt.cumulative_sum(x, axis=1)
+    assert r.shape == x.shape
+    assert r.size == 0
+
+    r = dpt.cumulative_sum(x, axis=0)
+    assert r.shape == x.shape
+    assert r.size == 0
+
+    r = dpt.cumulative_sum(x, axis=1, include_initial=True)
+    assert r.shape == (n0, n1 + 1, n2)
+    assert r.size == (n0 * n2)
+
+    r = dpt.cumulative_sum(x, axis=0, include_initial=True)
+    assert r.shape == (n0 + 1, n1, n2)
+    assert r.size == 0
+
+
+@pytest.mark.parametrize("arg_dtype", all_types)
+def test_cumsum_arg_dtype_default_output_dtype_matrix(arg_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    n = 100
+    x = dpt.ones(n, dtype=arg_dtype)
+    r = dpt.cumulative_sum(x)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    if x.dtype.kind == "i":
+        assert r.dtype.kind == "i"
+    elif x.dtype.kind == "u":
+        assert r.dtype.kind == "u"
+    elif x.dtype.kind == "fc":
+        assert r.dtype == arg_dtype
+
+    r_expected = dpt.arange(1, n + 1, dtype=r.dtype)
+
+    assert dpt.all(r == r_expected)
+
+
+@pytest.mark.parametrize("arg_dtype", all_types)
+@pytest.mark.parametrize("out_dtype", all_types)
+def test_cumsum_arg_out_dtype_matrix(arg_dtype, out_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+    skip_if_dtype_not_supported(out_dtype, q)
+
+    n = 100
+    x = dpt.ones(n, dtype=arg_dtype)
+    r = dpt.cumulative_sum(x, dtype=out_dtype)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == dpt.dtype(out_dtype)
+    if out_dtype == dpt.bool:
+        assert dpt.all(r)
+    else:
+        r_expected = dpt.arange(1, n + 1, dtype=out_dtype)
+        assert dpt.all(r == r_expected)
+
+
+def test_accumulator_out_kwarg():
+    q = get_queue_or_skip()
+
+    n = 100
+
+    expected = dpt.arange(1, n + 1, dtype="i4", sycl_queue=q)
+    x = dpt.ones(n, dtype="i4", sycl_queue=q)
+    out = dpt.empty_like(x, dtype="i4")
+    dpt.cumulative_sum(x, dtype="i4", out=out)
+    assert dpt.all(expected == out)
+
+    # overlap
+    x = dpt.ones(n, dtype="i4", sycl_queue=q)
+    dpt.cumulative_sum(x, dtype="i4", out=x)
+    assert dpt.all(x == expected)
+
+    # axis before final axis
+    expected = dpt.broadcast_to(
+        dpt.arange(1, n + 1, dtype="i4", sycl_queue=q), (n, n)
+    ).mT
+    x = dpt.ones((n, n), dtype="i4", sycl_queue=q)
+    out = dpt.empty_like(x, dtype="i4")
+    dpt.cumulative_sum(x, axis=0, dtype="i4", out=out)
+    assert dpt.all(expected == out)
+
+    # scalar
+    x = dpt.asarray(3, dtype="i4")
+    out = dpt.empty((), dtype="i4")
+    expected = 3
+    dpt.cumulative_sum(x, dtype="i4", out=out)
+    assert expected == out
+
+
+def test_accumulator_arg_validation():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    n = 5
+    x1 = dpt.ones((n, n), dtype="f4", sycl_queue=q1)
+    x2 = dpt.ones(n, dtype="f4", sycl_queue=q1)
+
+    # must be usm_ndarray
+    with pytest.raises(TypeError):
+        dpt.cumulative_sum(dict())
+
+    # axis must be specified when input not 1D
+    with pytest.raises(ValueError):
+        dpt.cumulative_sum(x1)
+
+    # out must be usm_ndarray
+    with pytest.raises(TypeError):
+        dpt.cumulative_sum(x2, out=dict())
+
+    # out must be writable
+    out_not_writable = dpt.empty_like(x2)
+    out_not_writable.flags.writable = False
+    with pytest.raises(ValueError):
+        dpt.cumulative_sum(x2, out=out_not_writable)
+
+    # out must be expected shape
+    out_wrong_shape = dpt.ones(n + 1, dtype=x2.dtype, sycl_queue=q1)
+    with pytest.raises(ValueError):
+        dpt.cumulative_sum(x2, out=out_wrong_shape)
+
+    # out must be expected dtype
+    out_wrong_dtype = dpt.empty_like(x2, dtype="i4")
+    with pytest.raises(ValueError):
+        dpt.cumulative_sum(x2, out=out_wrong_dtype)
+
+    # compute follows data
+    out_wrong_queue = dpt.empty_like(x2, sycl_queue=q2)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.cumulative_sum(x2, out=out_wrong_queue)
+
+
+def test_cumsum_nan_propagation():
+    get_queue_or_skip()
+
+    n = 100
+    x = dpt.ones(n, dtype="f4")
+    i = randrange(n)
+    x[i] = dpt.nan
+
+    r = dpt.cumulative_sum(x)
+    assert dpt.all(dpt.isnan(r[i:]))
+
+
+def test_cumprod_nan_propagation():
+    get_queue_or_skip()
+
+    n = 100
+    x = dpt.ones(n, dtype="f4")
+    i = randrange(n)
+    x[i] = dpt.nan
+
+    r = dpt.cumulative_prod(x)
+    assert dpt.all(dpt.isnan(r[i:]))
+
+
+def test_logcumsumexp_nan_propagation():
+    get_queue_or_skip()
+
+    n = 100
+    x = dpt.ones(n, dtype="f4")
+    i = randrange(n)
+    x[i] = dpt.nan
+
+    r = dpt.cumulative_logsumexp(x)
+    assert dpt.all(dpt.isnan(r[i:]))
+
+
+@pytest.mark.parametrize("arg_dtype", no_complex_types)
+def test_logcumsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    x = dpt.ones(10, dtype=arg_dtype, sycl_queue=q)
+    r = dpt.cumulative_logsumexp(x)
+
+    if arg_dtype.kind in "biu":
+        assert r.dtype.kind == "f"
+    else:
+        assert r.dtype == arg_dtype
+
+
+def test_logcumsumexp_complex_error():
+    get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="c8")
+    with pytest.raises(ValueError):
+        dpt.cumulative_logsumexp(x)
+
+
+def test_cumprod_basic():
+    get_queue_or_skip()
+
+    n = 50
+    val = 2
+    x = dpt.full(n, val, dtype="i8")
+    r = dpt.cumulative_prod(x)
+    expected = dpt.pow(val, dpt.arange(1, n + 1, dtype="i8"))
+
+    assert dpt.all(r == expected)
+
+    x = dpt.tile(dpt.asarray([2, 0.5], dtype="f4"), 10000)
+    expected = dpt.tile(dpt.asarray([2, 1], dtype="f4"), 10000)
+    r = dpt.cumulative_prod(x)
+    assert dpt.all(r == expected)
+
+
+def test_logcumsumexp_basic():
+    get_queue_or_skip()
+
+    dt = dpt.float32
+    x = dpt.ones(1000, dtype=dt)
+    r = dpt.cumulative_logsumexp(x)
+
+    expected = 1 + dpt.log(dpt.arange(1, 1001, dtype=dt))
+
+    tol = 4 * dpt.finfo(dt).resolution
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol)
+
+
+def geometric_series_closed_form(n, dtype=None, device=None):
+    """Closed form for cumulative_logsumexp(dpt.arange(-n, 0))
+
+    :math:`r[k] == -n + k + log(1 - exp(-k-1)) - log(1-exp(-1))`
+    """
+    x = dpt.arange(-n, 0, dtype=dtype, device=device)
+    y = dpt.arange(-1, -n - 1, step=-1, dtype=dtype, device=device)
+    y = dpt.exp(y, out=y)
+    y = dpt.negative(y, out=y)
+    y = dpt.log1p(y, out=y)
+    y -= y[0]
+    return x + y
+
+
+@pytest.mark.parametrize("fpdt", rfp_types)
+def test_cumulative_logsumexp_closed_form(fpdt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(fpdt, q)
+
+    n = 128
+    r = dpt.cumulative_logsumexp(dpt.arange(-n, 0, dtype=fpdt, device=q))
+    expected = geometric_series_closed_form(n, dtype=fpdt, device=q)
+
+    tol = 4 * dpt.finfo(fpdt).eps
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("p", [257, 260, 273, 280, 509, 512])
+def test_cumulative_sum_gh_1901(p):
+    get_queue_or_skip()
+
+    n = p * p
+    dt = dpt.int32
+    inp = dpt.ones(n, dtype=dt)
+    r = dpt.cumulative_sum(inp, dtype=dt)
+    assert dpt.all(r == dpt.arange(1, n + 1, dtype=dt))
+
+
+@pytest.mark.parametrize(
+    "dt", ["i1", "i2", "i4", "i8", "f2", "f4", "f8", "c8", "c16"]
+)
+def test_gh_2017(dt):
+    "See https://github.com/IntelPython/dpctl/issues/2017"
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+    x = dpt.asarray([-1, 1], dtype=dpt.dtype(dt), sycl_queue=q)
+    r = dpt.cumulative_sum(x, dtype="?")
+    assert dpt.all(r)
diff --git a/dpnp/tests/tensor/test_tensor_array_api_inspection.py b/dpnp/tests/tensor/test_tensor_array_api_inspection.py
new file mode 100644
index 000000000000..2eb198944656
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_array_api_inspection.py
@@ -0,0 +1,238 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._tensor_impl import (
+    default_device_complex_type,
+    default_device_fp_type,
+    default_device_index_type,
+    default_device_int_type,
+)
+
+_dtypes_no_fp16_fp64 = {
+    "bool": dpt.bool,
+    "float32": dpt.float32,
+    "complex64": dpt.complex64,
+    "int8": dpt.int8,
+    "int16": dpt.int16,
+    "int32": dpt.int32,
+    "int64": dpt.int64,
+    "uint8": dpt.uint8,
+    "uint16": dpt.uint16,
+    "uint32": dpt.uint32,
+    "uint64": dpt.uint64,
+}
+
+
+def test_array_api_inspection_methods():
+    info = dpt.__array_namespace_info__()
+    assert info.capabilities()
+    try:
+        assert info.default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+    assert info.default_dtypes()
+    assert info.devices()
+    assert info.dtypes()
+
+
+def test_array_api_inspection_default_device():
+    try:
+        dev = dpctl.select_default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+    assert dpt.__array_namespace_info__().default_device() == dev
+
+
+def test_array_api_inspection_devices():
+    try:
+        devices2 = dpctl.get_devices()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+    devices1 = dpt.__array_namespace_info__().devices()
+    assert len(devices1) == len(devices2)
+    assert devices1 == devices2
+
+
+def test_array_api_inspection_capabilities():
+    capabilities = dpt.__array_namespace_info__().capabilities()
+    assert capabilities["boolean indexing"]
+    assert capabilities["data-dependent shapes"]
+    assert capabilities["max dimensions"] is None
+
+
+def test_array_api_inspection_default_dtypes():
+    try:
+        dev = dpctl.select_default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    int_dt = default_device_int_type(dev)
+    ind_dt = default_device_index_type(dev)
+    fp_dt = default_device_fp_type(dev)
+    cm_dt = default_device_complex_type(dev)
+
+    info = dpt.__array_namespace_info__()
+    default_dts_nodev = info.default_dtypes()
+    default_dts_dev = info.default_dtypes(device=dev)
+
+    assert (
+        int_dt == default_dts_nodev["integral"] == default_dts_dev["integral"]
+    )
+    assert (
+        ind_dt == default_dts_nodev["indexing"] == default_dts_dev["indexing"]
+    )
+    assert (
+        fp_dt
+        == default_dts_nodev["real floating"]
+        == default_dts_dev["real floating"]
+    )
+    assert (
+        cm_dt
+        == default_dts_nodev["complex floating"]
+        == default_dts_dev["complex floating"]
+    )
+
+
+def test_array_api_inspection_default_device_dtypes():
+    try:
+        dev = dpctl.select_default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+    dtypes = _dtypes_no_fp16_fp64.copy()
+    if dev.has_aspect_fp64:
+        dtypes["float64"] = dpt.float64
+        dtypes["complex128"] = dpt.complex128
+
+    assert dtypes == dpt.__array_namespace_info__().dtypes()
+
+
+def test_array_api_inspection_device_dtypes():
+    info = dpt.__array_namespace_info__()
+    try:
+        dev = info.default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+    dtypes = _dtypes_no_fp16_fp64.copy()
+    if dev.has_aspect_fp64:
+        dtypes["float64"] = dpt.float64
+        dtypes["complex128"] = dpt.complex128
+
+    assert dtypes == dpt.__array_namespace_info__().dtypes(device=dev)
+
+
+def test_array_api_inspection_dtype_kind():
+    info = dpt.__array_namespace_info__()
+    try:
+        info.default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    f_dtypes = info.dtypes(kind="real floating")
+    assert all([_dt[1].kind == "f" for _dt in f_dtypes.items()])
+
+    i_dtypes = info.dtypes(kind="signed integer")
+    assert all([_dt[1].kind == "i" for _dt in i_dtypes.items()])
+
+    u_dtypes = info.dtypes(kind="unsigned integer")
+    assert all([_dt[1].kind == "u" for _dt in u_dtypes.items()])
+
+    ui_dtypes = info.dtypes(kind="unsigned integer")
+    assert all([_dt[1].kind in "ui" for _dt in ui_dtypes.items()])
+
+    c_dtypes = info.dtypes(kind="complex floating")
+    assert all([_dt[1].kind == "c" for _dt in c_dtypes.items()])
+
+    assert info.dtypes(kind="bool") == {"bool": dpt.bool}
+
+    _signed_ints = {
+        "int8": dpt.int8,
+        "int16": dpt.int16,
+        "int32": dpt.int32,
+        "int64": dpt.int64,
+    }
+    assert (
+        info.dtypes(kind=("signed integer", "signed integer")) == _signed_ints
+    )
+    assert (
+        info.dtypes(
+            kind=("integral", "bool", "real floating", "complex floating")
+        )
+        == info.dtypes()
+    )
+    assert info.dtypes(
+        kind=("integral", "real floating", "complex floating")
+    ) == info.dtypes(kind="numeric")
+
+
+def test_array_api_inspection_dtype_kind_errors():
+    info = dpt.__array_namespace_info__()
+    try:
+        info.default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    with pytest.raises(ValueError):
+        info.dtypes(kind="error")
+
+    with pytest.raises(TypeError):
+        info.dtypes(kind={0: "real floating"})
+
+
+def test_array_api_inspection_device_types():
+    info = dpt.__array_namespace_info__()
+    try:
+        dev = info.default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    q = dpctl.SyclQueue(dev)
+    assert info.default_dtypes(device=q)
+    assert info.dtypes(device=q)
+
+    dev_dpt = dpt.Device.create_device(dev)
+    assert info.default_dtypes(device=dev_dpt)
+    assert info.dtypes(device=dev_dpt)
+
+    filter = dev.get_filter_string()
+    assert info.default_dtypes(device=filter)
+    assert info.dtypes(device=filter)
+
+
+def test_array_api_inspection_device_errors():
+    info = dpt.__array_namespace_info__()
+
+    bad_dev = {}
+    with pytest.raises(TypeError):
+        info.dtypes(device=bad_dev)
+
+    with pytest.raises(TypeError):
+        info.default_dtypes(device=bad_dev)
diff --git a/dpnp/tests/tensor/test_tensor_asarray.py b/dpnp/tests/tensor/test_tensor_asarray.py
new file mode 100644
index 000000000000..33d6d00e3ba8
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_asarray.py
@@ -0,0 +1,664 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+@pytest.mark.parametrize(
+    "src_usm_type, dst_usm_type",
+    [
+        ("device", "shared"),
+        ("device", "host"),
+        ("shared", "device"),
+        ("shared", "host"),
+        ("host", "device"),
+        ("host", "shared"),
+    ],
+)
+def test_asarray_change_usm_type(src_usm_type, dst_usm_type):
+    try:
+        d = dpctl.SyclDevice()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    X = dpt.empty(10, dtype="u1", usm_type=src_usm_type)
+    Y = dpt.asarray(X, usm_type=dst_usm_type)
+    assert X.shape == Y.shape
+    assert X.usm_type == src_usm_type
+    assert Y.usm_type == dst_usm_type
+
+    with pytest.raises(ValueError):
+        # zero copy is not possible
+        dpt.asarray(X, usm_type=dst_usm_type, copy=False)
+
+    Y = dpt.asarray(X, usm_type=dst_usm_type, sycl_queue=X.sycl_queue)
+    assert X.shape == Y.shape
+    assert Y.usm_type == dst_usm_type
+
+    Y = dpt.asarray(
+        X,
+        usm_type=dst_usm_type,
+        sycl_queue=X.sycl_queue,
+        device=d.get_filter_string(),
+    )
+    assert X.shape == Y.shape
+    assert Y.usm_type == dst_usm_type
+
+
+def test_asarray_from_numpy():
+    Xnp = np.arange(10)
+    try:
+        Y = dpt.asarray(Xnp, usm_type="device")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert type(Y) is dpt.usm_ndarray
+    assert Y.shape == Xnp.shape
+    assert Y.dtype == Xnp.dtype
+    # Fortran contiguous case
+    Xnp = np.array([[1, 2, 3], [4, 5, 6]], dtype="f4", order="F")
+    Y = dpt.asarray(Xnp, usm_type="shared")
+    assert type(Y) is dpt.usm_ndarray
+    assert Y.shape == Xnp.shape
+    assert Y.dtype == Xnp.dtype
+    # general strided case
+    Xnp = np.array([[1, 2, 3], [4, 5, 6]], dtype="i8")
+    Y = dpt.asarray(Xnp[::-1, ::-1], usm_type="host")
+    assert type(Y) is dpt.usm_ndarray
+    assert Y.shape == Xnp.shape
+    assert Y.dtype == Xnp.dtype
+
+
+def test_asarray_from_sequence():
+    X = [1, 2, 3]
+    try:
+        Y = dpt.asarray(X, usm_type="device")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert type(Y) is dpt.usm_ndarray
+
+    X = [(1, 1), (2.0, 2.0 + 1.0j), range(4, 6), np.array([3, 4], dtype="c16")]
+    Y = dpt.asarray(X, usm_type="device")
+    assert type(Y) is dpt.usm_ndarray
+    assert Y.ndim == 2
+    assert Y.shape == (len(X), 2)
+
+    X = []
+    Y = dpt.asarray(X, usm_type="device")
+    assert type(Y) is dpt.usm_ndarray
+    assert Y.shape == (0,)
+
+    X = [[], []]
+    Y = dpt.asarray(X, usm_type="device")
+    assert type(Y) is dpt.usm_ndarray
+    assert Y.shape == (2, 0)
+
+    X = [True, False]
+    Y = dpt.asarray(X, usm_type="device")
+    assert type(Y) is dpt.usm_ndarray
+    assert Y.dtype.kind == "b"
+
+
+def test_asarray_from_object_with_suai():
+    """Test that asarray can deal with opaque objects implementing SUAI"""
+
+    class Dummy:
+        def __init__(self, obj, iface):
+            self.obj = obj
+            self.__sycl_usm_array_interface__ = iface
+
+    try:
+        X = dpt.empty((2, 3, 4), dtype="f4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    Y = dpt.asarray(Dummy(X, X.__sycl_usm_array_interface__))
+    assert Y.shape == X.shape
+    assert X.usm_type == Y.usm_type
+    assert X.dtype == Y.dtype
+    assert X.sycl_device == Y.sycl_device
+
+
+def test_asarray_input_validation():
+    with pytest.raises(TypeError):
+        # copy keyword is not of right type
+        dpt.asarray([1], copy="invalid")
+    with pytest.raises(TypeError):
+        # order keyword is not valid
+        dpt.asarray([1], order=1)
+    with pytest.raises(TypeError):
+        # dtype is not valid
+        dpt.asarray([1], dtype="invalid")
+    with pytest.raises(ValueError):
+        # unexpected value of order
+        dpt.asarray([1], order="Z")
+    with pytest.raises(TypeError):
+        # usm_type is of wrong type
+        dpt.asarray([1], usm_type=dict())
+    with pytest.raises(ValueError):
+        # usm_type has wrong value
+        dpt.asarray([1], usm_type="mistake")
+    try:
+        wrong_queue_type = dpctl.SyclContext()
+    except dpctl.SyclContextCreationError:
+        # use any other type
+        wrong_queue_type = Ellipsis
+    with pytest.raises(TypeError):
+        # sycl_queue type is not right
+        dpt.asarray([1], sycl_queue=wrong_queue_type)
+    with pytest.raises(ValueError):
+        # sequence is not rectangular
+        dpt.asarray([[1], 2])
+    with pytest.raises(OverflowError):
+        # Python int too large for type
+        dpt.asarray(-9223372036854775809, dtype="i4")
+    with pytest.raises(ValueError):
+        # buffer to usm_ndarray requires a copy
+        dpt.asarray(memoryview(np.arange(5)), copy=False)
+    with pytest.raises(ValueError):
+        # Numpy array to usm_ndarray requires a copy
+        dpt.asarray(np.arange(5), copy=False)
+    with pytest.raises(ValueError):
+        # Python sequence to usm_ndarray requires a copy
+        dpt.asarray([1, 2, 3], copy=False)
+    with pytest.raises(ValueError):
+        # Python scalar to usm_ndarray requires a copy
+        dpt.asarray(5, copy=False)
+
+
+def test_asarray_input_validation2():
+    d = dpctl.get_devices()
+    if len(d) < 2:
+        pytest.skip("Not enough SYCL devices available")
+
+    d0, d1 = d[:2]
+    try:
+        q0 = dpctl.SyclQueue(d0)
+    except dpctl.SyclQueueCreationError:
+        pytest.skip(f"SyclQueue could not be created for {d0}")
+    try:
+        q1 = dpctl.SyclQueue(d1)
+    except dpctl.SyclQueueCreationError:
+        pytest.skip(f"SyclQueue could not be created for {d1}")
+    with pytest.raises(TypeError):
+        dpt.asarray([1, 2], sycl_queue=q0, device=q1)
+
+
+def test_asarray_scalars():
+    import ctypes
+
+    try:
+        Y = dpt.asarray(5)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert Y.dtype == dpt.dtype(int)
+    Y = dpt.asarray(5.2)
+    if Y.sycl_device.has_aspect_fp64:
+        assert Y.dtype == dpt.dtype(float)
+    else:
+        assert Y.dtype == dpt.dtype(dpt.float32)
+    Y = dpt.asarray(np.float32(2.3))
+    assert Y.dtype == dpt.dtype(dpt.float32)
+    Y = dpt.asarray(1.0j)
+    if Y.sycl_device.has_aspect_fp64:
+        assert Y.dtype == dpt.dtype(complex)
+    else:
+        assert Y.dtype == dpt.dtype(dpt.complex64)
+    Y = dpt.asarray(ctypes.c_int(8))
+    assert Y.dtype == dpt.dtype(ctypes.c_int)
+
+
+def test_asarray_copy_false():
+    q = get_queue_or_skip()
+    rng = np.random.default_rng()
+    Xnp = rng.integers(low=-255, high=255, size=(10, 4), dtype=np.int64)
+    X = dpt.from_numpy(Xnp, usm_type="device", sycl_queue=q)
+    Y1 = dpt.asarray(X, copy=False, order="K")
+    assert Y1 is X
+    Y1c = dpt.asarray(X, copy=True, order="K")
+    assert not (Y1c is X)
+    Y2 = dpt.asarray(X, copy=False, order="C")
+    assert Y2 is X
+    Y3 = dpt.asarray(X, copy=False, order="A")
+    assert Y3 is X
+    with pytest.raises(ValueError):
+        Y1 = dpt.asarray(X, copy=False, order="F")
+    Xf = dpt.empty(
+        X.shape,
+        dtype=X.dtype,
+        usm_type="device",
+        sycl_queue=X.sycl_queue,
+        order="F",
+    )
+    Xf[:] = X
+    Y4 = dpt.asarray(Xf, copy=False, order="K")
+    assert Y4 is Xf
+    Y5 = dpt.asarray(Xf, copy=False, order="F")
+    assert Y5 is Xf
+    Y6 = dpt.asarray(Xf, copy=False, order="A")
+    assert Y6 is Xf
+    with pytest.raises(ValueError):
+        dpt.asarray(Xf, copy=False, order="C")
+
+
+def test_asarray_invalid_dtype():
+    q = get_queue_or_skip()
+    Xnp = np.array([1, 2, 3], dtype=object)
+    with pytest.raises(TypeError):
+        dpt.asarray(Xnp, sycl_queue=q)
+
+
+def test_asarray_cross_device():
+    q = get_queue_or_skip()
+    qprof = dpctl.SyclQueue(property="enable_profiling")
+    x = dpt.empty(10, dtype="i8", sycl_queue=q)
+    y = dpt.asarray(x, sycl_queue=qprof)
+    assert y.sycl_queue == qprof
+
+
+def test_asarray_seq_of_arrays_simple():
+    get_queue_or_skip()
+    r = dpt.arange(10)
+    m = dpt.asarray(
+        [
+            r,
+        ]
+        * 4
+    )
+    assert m.shape == (4,) + r.shape
+    assert m.dtype == r.dtype
+    assert m.device == r.device
+
+
+def test_asarray_seq_of_arrays():
+    get_queue_or_skip()
+    m = dpt.ones((2, 4), dtype="i4")
+    w = dpt.zeros(4)
+    v = dpt.full(4, -1)
+    ar = dpt.asarray([m, [w, v]])
+    assert ar.shape == (2, 2, 4)
+    assert ar.device == m.device
+    assert ar.device == w.device
+    assert ar.device == v.device
+
+
+def test_asarray_seq_of_array_different_queue():
+    get_queue_or_skip()
+    m = dpt.ones((2, 4), dtype="i4")
+    w = dpt.zeros(4)
+    v = dpt.full(4, -1)
+    qprof = dpctl.SyclQueue(property="enable_profiling")
+    ar = dpt.asarray([m, [w, v]], sycl_queue=qprof)
+    assert ar.shape == (2, 2, 4)
+    assert ar.sycl_queue == qprof
+
+
+def test_asarray_seq_of_suai():
+    get_queue_or_skip()
+
+    class Dummy:
+        def __init__(self, obj, iface):
+            self.obj = obj
+            self.__sycl_usm_array_interface__ = iface
+
+    o = dpt.empty(0, usm_type="shared")
+    d = Dummy(o, o.__sycl_usm_array_interface__)
+    x = dpt.asarray(d)
+    assert x.shape == (0,)
+    assert x.usm_type == o.usm_type
+    assert x._pointer == o._pointer
+    assert x.sycl_queue == o.sycl_queue
+
+    x = dpt.asarray([d, d])
+    assert x.shape == (2, 0)
+    assert x.usm_type == o.usm_type
+    assert x.sycl_queue == o.sycl_queue
+
+
+def test_asarray_seq_of_suai_different_queue():
+    q = get_queue_or_skip()
+
+    class Dummy:
+        def __init__(self, obj, iface):
+            self.obj = obj
+            self.__sycl_usm_array_interface__ = iface
+
+        @property
+        def shape(self):
+            return self.__sycl_usm_array_interface__["shape"]
+
+    q2 = dpctl.SyclQueue()
+    assert q != q2
+    o = dpt.empty((2, 2), usm_type="shared", sycl_queue=q2)
+    d = Dummy(o, o.__sycl_usm_array_interface__)
+
+    x = dpt.asarray(d, sycl_queue=q)
+    assert x.sycl_queue == q
+    assert x.shape == d.shape
+    x = dpt.asarray([d], sycl_queue=q)
+    assert x.sycl_queue == q
+    assert x.shape == (1,) + d.shape
+    x = dpt.asarray([d, d], sycl_queue=q)
+    assert x.sycl_queue == q
+    assert x.shape == (2,) + d.shape
+
+
+def test_asarray_seq_of_arrays_on_different_queues():
+    q = get_queue_or_skip()
+
+    m = dpt.empty((2, 4), dtype="i2", sycl_queue=q)
+    q2 = dpctl.SyclQueue()
+    w = dpt.empty(4, dtype="i1", sycl_queue=q2)
+    q3 = dpctl.SyclQueue()
+    py_seq = [
+        0,
+    ] * w.shape[0]
+    res = dpt.asarray([m, [w, py_seq]], sycl_queue=q3)
+    assert res.sycl_queue == q3
+    assert dpt.isdtype(res.dtype, "integral")
+
+    res = dpt.asarray([m, [w, range(w.shape[0])]], sycl_queue=q3)
+    assert res.sycl_queue == q3
+    assert dpt.isdtype(res.dtype, "integral")
+
+    res = dpt.asarray([m, [w, w]], sycl_queue=q)
+    assert res.sycl_queue == q
+    assert dpt.isdtype(res.dtype, "integral")
+
+    res = dpt.asarray([m, [w, dpt.asnumpy(w)]], sycl_queue=q2)
+    assert res.sycl_queue == q2
+    assert dpt.isdtype(res.dtype, "integral")
+
+    res = dpt.asarray([w, dpt.asnumpy(w)])
+    assert res.sycl_queue == w.sycl_queue
+    assert dpt.isdtype(res.dtype, "integral")
+
+    with pytest.raises(dpctl.utils.ExecutionPlacementError):
+        dpt.asarray([m, [w, py_seq]])
+
+
+def test_ulonglong_gh_1167():
+    get_queue_or_skip()
+    x = dpt.asarray(9223372036854775807, dtype="u8")
+    assert x.dtype == dpt.uint64
+    x = dpt.asarray(9223372036854775808, dtype="u8")
+    assert x.dtype == dpt.uint64
+
+
+def test_orderK_gh_1350():
+    get_queue_or_skip()
+    a = dpt.empty((2, 3, 4), dtype="u1")
+    b = dpt.permute_dims(a, (2, 0, 1))
+    c = dpt.asarray(b, copy=True, order="K")
+
+    assert c.shape == b.shape
+    assert c.strides == b.strides
+    assert c._element_offset == 0
+    assert not c._pointer == b._pointer
+
+
+def _typesafe_arange(n: int, dtype_: dpt.dtype, device: object):
+    n_half = n // 2
+    if dtype_.kind in "ui":
+        ii = dpt.iinfo(dtype_)
+        m0 = max(ii.min, -n_half)
+        m1 = min(m0 + n, ii.max)
+        n_tiles = (n + m1 - m0 - 1) // (m1 - m0)
+        res = dpt.arange(m0, m1, dtype=dtype_, device=device)
+    elif dtype_.kind == "b":
+        n_tiles = (n + 1) // 2
+        res = dpt.asarray([False, True], dtype=dtype_, device=device)
+    else:
+        m0 = -n_half
+        m1 = m0 + n
+        n_tiles = 1
+        res = dpt.linspace(m0, m1, num=n, dtype=dtype_, device=device)
+    if n_tiles > 1:
+        res = dpt.tile(res, n_tiles)[:n]
+    return res
+
+
+_all_dtypes = [
+    "b1",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.mark.parametrize("dt", _all_dtypes)
+def test_as_c_contig_rect(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    dtype_ = dpt.dtype(dt)
+    n0, n1, n2 = 6, 35, 37
+
+    arr_flat = _typesafe_arange(n0 * n1 * n2, dtype_, q)
+    x = dpt.reshape(arr_flat, (n0, n1, n2)).mT
+
+    y = dpt.asarray(x, order="C")
+    assert dpt.all(x == y)
+
+    x2 = x[0]
+    y2 = dpt.asarray(x2, order="C")
+    assert dpt.all(x2 == y2)
+
+    x3 = dpt.flip(x, axis=1)
+    y3 = dpt.asarray(x3, order="C")
+    assert dpt.all(x3 == y3)
+
+    x4 = dpt.reshape(arr_flat, (2, 3, n1, n2)).mT
+    x5 = x4[:, :2]
+    y5 = dpt.asarray(x5, order="C")
+    assert dpt.all(x5 == y5)
+
+    x6 = dpt.reshape(arr_flat, (n0, n1, n2), order="F")
+    y6 = dpt.asarray(x6, order="C")
+    assert dpt.all(x6 == y6)
+
+
+@pytest.mark.parametrize("dt", _all_dtypes)
+def test_as_f_contig_rect(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    dtype_ = dpt.dtype(dt)
+    n0, n1, n2 = 6, 35, 37
+
+    arr_flat = _typesafe_arange(n0 * n1 * n2, dtype_, q)
+    x = dpt.reshape(arr_flat, (n0, n1, n2))
+
+    y = dpt.asarray(x, order="F")
+    assert dpt.all(x == y)
+
+    x2 = x[0]
+    y2 = dpt.asarray(x2, order="F")
+    assert dpt.all(x2 == y2)
+
+    x3 = dpt.flip(x, axis=1)
+    y3 = dpt.asarray(x3, order="F")
+    assert dpt.all(x3 == y3)
+
+    x4 = dpt.reshape(arr_flat, (2, 3, n1, n2))
+    x5 = dpt.moveaxis(x4[:, :2], (2, 3), (0, 1))
+    y5 = dpt.asarray(x5, order="F")
+    assert dpt.all(x5 == y5)
+
+
+@pytest.mark.parametrize("dt", _all_dtypes)
+def test_as_c_contig_square(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    dtype_ = dpt.dtype(dt)
+    n0, n1 = 4, 53
+
+    arr_flat = _typesafe_arange(n0 * n1 * n1, dtype_, q)
+    x = dpt.reshape(arr_flat, (n0, n1, n1)).mT
+
+    y = dpt.asarray(x, order="C")
+    assert dpt.all(x == y)
+
+    x2 = x[0]
+    y2 = dpt.asarray(x2, order="C")
+    assert dpt.all(x2 == y2)
+
+    x3 = dpt.flip(x, axis=1)
+    y3 = dpt.asarray(x3, order="C")
+    assert dpt.all(x3 == y3)
+
+
+@pytest.mark.parametrize("dt", _all_dtypes)
+def test_as_f_contig_square(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    dtype_ = dpt.dtype(dt)
+    n0, n1 = 6, 53
+
+    arr_flat = _typesafe_arange(n0 * n1 * n1, dtype_, q)
+    x = dpt.moveaxis(dpt.reshape(arr_flat, (n0, n1, n1)), (1, 2), (0, 1))
+
+    y = dpt.asarray(x, order="F")
+    assert dpt.all(x == y)
+
+    x2 = x[..., 0]
+    y2 = dpt.asarray(x2, order="F")
+    assert dpt.all(x2 == y2)
+
+    x3 = dpt.flip(x, axis=1)
+    y3 = dpt.asarray(x3, order="F")
+    assert dpt.all(x3 == y3)
+
+
+class MockArrayWithBothProtocols:
+    """
+    Object that implements both __sycl_usm_array_interface__
+    and __usm_ndarray__ properties.
+    """
+
+    def __init__(self, usm_ar):
+        if not isinstance(usm_ar, dpt.usm_ndarray):
+            raise TypeError
+        self._arr = usm_ar
+
+    @property
+    def __usm_ndarray__(self):
+        return self._arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self._arr.__sycl_usm_array_interface__
+
+
+class MockArrayWithSUAIOnly:
+    """
+    Object that implements only the
+    __sycl_usm_array_interface__ property.
+    """
+
+    def __init__(self, usm_ar):
+        if not isinstance(usm_ar, dpt.usm_ndarray):
+            raise TypeError
+        self._arr = usm_ar
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self._arr.__sycl_usm_array_interface__
+
+
+@pytest.mark.parametrize("usm_type", ["shared", "device", "host"])
+def test_asarray_support_for_usm_ndarray_protocol(usm_type):
+    get_queue_or_skip()
+
+    x = dpt.arange(256, dtype="i4", usm_type=usm_type)
+
+    o1 = MockArrayWithBothProtocols(x)
+    o2 = MockArrayWithSUAIOnly(x)
+
+    y1 = dpt.asarray(o1)
+    assert x.sycl_queue == y1.sycl_queue
+    assert x.usm_type == y1.usm_type
+    assert x.dtype == y1.dtype
+    assert y1.usm_data.reference_obj is None
+    assert dpt.all(x == y1)
+
+    y2 = dpt.asarray(o2)
+    assert x.sycl_queue == y2.sycl_queue
+    assert x.usm_type == y2.usm_type
+    assert x.dtype == y2.dtype
+    assert not (y2.usm_data.reference_obj is None)
+    assert dpt.all(x == y2)
+
+    y3 = dpt.asarray([o1, o2])
+    assert x.sycl_queue == y3.sycl_queue
+    assert x.usm_type == y3.usm_type
+    assert x.dtype == y3.dtype
+    assert y3.usm_data.reference_obj is None
+    assert dpt.all(x[dpt.newaxis, :] == y3)
+
+
+@pytest.mark.parametrize("dt", [dpt.float16, dpt.float64, dpt.complex128])
+def test_asarray_to_device_with_unsupported_dtype(dt):
+    aspect = "fp16" if dt == dpt.float16 else "fp64"
+    try:
+        d0 = dpctl.select_device_with_aspects(aspect)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No device with aspect for test")
+    d1 = None
+    for d in dpctl.get_devices():
+        if d.default_selector_score < 0:
+            pass
+        try:
+            d1 = dpctl.select_device_with_aspects(
+                d.device_type.name, excluded_aspects=[aspect]
+            )
+        except dpctl.SyclDeviceCreationError:
+            pass
+    if d1 is None:
+        pytest.skip("No device with missing aspect for test")
+    x = dpt.ones(10, dtype=dt, device=d0)
+    y = dpt.asarray(x, device=d1)
+    assert y.sycl_device == d1
diff --git a/dpnp/tests/tensor/test_tensor_clip.py b/dpnp/tests/tensor/test_tensor_clip.py
new file mode 100644
index 000000000000..de4717f22023
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_clip.py
@@ -0,0 +1,793 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import numpy as np
+import pytest
+from dpctl.utils import ExecutionPlacementError
+from numpy.testing import assert_raises_regex
+
+import dpnp.tensor as dpt
+from dpnp.tensor._elementwise_common import _get_dtype
+from dpnp.tensor._type_utils import (
+    _can_cast,
+    _strong_dtype_num_kind,
+    _weak_type_num_kind,
+)
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_all_dtypes = [
+    "?",
+    "u1",
+    "i1",
+    "u2",
+    "i2",
+    "u4",
+    "i4",
+    "u8",
+    "i8",
+    "e",
+    "f",
+    "d",
+    "F",
+    "D",
+]
+
+_usm_types = ["device", "shared", "host"]
+
+
+@pytest.mark.parametrize("dt1", _all_dtypes)
+@pytest.mark.parametrize("dt2", _all_dtypes)
+def test_clip_dtypes(dt1, dt2):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt1, q)
+    skip_if_dtype_not_supported(dt2, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=dt1, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=dt1, sycl_queue=q)
+    ar3 = dpt.ones_like(ar1, dtype=dt2, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    # also covers cases where dt1 == dt2
+    if _can_cast(ar3.dtype, ar1.dtype, _fp16, _fp64):
+        r = dpt.clip(ar1, ar2, ar3)
+        assert isinstance(r, dpt.usm_ndarray)
+        assert r.dtype == ar1.dtype
+        assert r.shape == ar1.shape
+        assert dpt.all(r == ar1)
+        assert r.sycl_queue == ar1.sycl_queue
+
+        r = dpt.clip(ar1, min=ar3, max=None)
+        assert isinstance(r, dpt.usm_ndarray)
+        assert r.dtype == ar1.dtype
+        assert r.shape == ar1.shape
+        assert dpt.all(r == ar1)
+        assert r.sycl_queue == ar1.sycl_queue
+
+        r = dpt.clip(ar1, min=None, max=ar3)
+        assert isinstance(r, dpt.usm_ndarray)
+        assert r.dtype == ar1.dtype
+        assert r.shape == ar1.shape
+        assert dpt.all(r == ar1)
+        assert r.sycl_queue == ar1.sycl_queue
+    else:
+        with pytest.raises(ValueError):
+            dpt.clip(ar1, ar2, ar3)
+        with pytest.raises(ValueError):
+            dpt.clip(ar1, min=ar3, max=None)
+        with pytest.raises(ValueError):
+            dpt.clip(ar1, min=None, max=ar3)
+
+
+def test_clip_empty():
+    get_queue_or_skip()
+
+    x = dpt.empty((2, 0, 3), dtype="i4")
+    a_min = dpt.ones((2, 0, 3), dtype="i4")
+    a_max = dpt.ones((2, 0, 3), dtype="i4")
+
+    r = dpt.clip(x, a_min, a_max)
+    assert r.size == 0
+    assert r.shape == x.shape
+
+
+def test_clip_python_scalars():
+    get_queue_or_skip()
+
+    arrs = [
+        dpt.ones(1, dtype="?"),
+        dpt.ones(1, dtype="i4"),
+        dpt.ones(1, dtype="f4"),
+        dpt.ones(1, dtype="c8"),
+    ]
+
+    py_zeros = [
+        False,
+        0,
+        0.0,
+        complex(0, 0),
+    ]
+
+    py_ones = [
+        True,
+        1,
+        1.0,
+        complex(1, 0),
+    ]
+
+    for zero, one, arr in zip(py_zeros, py_ones, arrs):
+        r = dpt.clip(arr, zero, one)
+        assert isinstance(r, dpt.usm_ndarray)
+        r = dpt.clip(arr, min=zero)
+        assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_clip_in_place():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    a_min = dpt.arange(1, 11, dtype="i4")
+    a_max = dpt.arange(2, 12, dtype="i4")
+    dpt.clip(x, a_min, a_max, out=x)
+    assert dpt.all(x == a_min)
+
+    x = dpt.arange(10, dtype="i4")
+    dpt.clip(x, min=a_min, max=None, out=x)
+    assert dpt.all(x == a_min)
+
+    x = dpt.arange(10, dtype="i4")
+    dpt.clip(x, a_min, a_max, out=a_max)
+    assert dpt.all(a_max == a_min)
+
+    a_min = dpt.arange(1, 11, dtype="i4")
+    dpt.clip(x, min=a_min, max=None, out=a_min[::-1])
+    assert dpt.all((x + 1)[::-1] == a_min)
+
+
+def test_clip_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="f4")
+    r = dpt.clip(x, -dpt.inf, dpt.inf)
+    assert dpt.all(r == x)
+    r = dpt.clip(x, dpt.nan, dpt.inf)
+    assert dpt.all(dpt.isnan(r))
+    r = dpt.clip(x, -dpt.inf, dpt.nan)
+    assert dpt.all(dpt.isnan(r))
+
+
+def test_clip_out_need_temporary():
+    get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i4")
+    a_max = dpt.asarray(3, dtype="i4")
+    dpt.clip(x[:6], 2, 3, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i4")
+    a_max = dpt.asarray(3, dtype="i2")
+    dpt.clip(x[:6], 2, 3, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i2")
+    a_max = dpt.asarray(3, dtype="i4")
+    dpt.clip(x[:6], 2, 3, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i2")
+    a_max = dpt.asarray(3, dtype="i1")
+    dpt.clip(x[:6], 2, 3, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.arange(12, dtype="i4")
+    dpt.clip(x[:6], out=x[-6:])
+    expected = dpt.arange(6, dtype="i4")
+    assert dpt.all(x[:-6] == expected) and dpt.all(x[-6:] == expected)
+
+    x = dpt.ones(10, dtype="i4")
+    dpt.clip(x, out=x)
+    assert dpt.all(x == 1)
+
+    x = dpt.full(6, 3, dtype="i4")
+    a_min = dpt.full(10, 2, dtype="i4")
+    a_max = dpt.asarray(4, dtype="i4")
+    dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+    x = dpt.full(6, 3, dtype="i4")
+    a_min = dpt.full(10, 2, dtype="i4")
+    a_max = dpt.asarray(4, dtype="i2")
+    dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+
+def test_clip_out_need_temporary_none():
+    get_queue_or_skip()
+
+    x = dpt.full(6, 3, dtype="i4")
+    # with min/max == None
+    a_min = dpt.full(10, 2, dtype="i4")
+    dpt.clip(x, min=a_min[:6], max=None, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+
+def test_clip_arg_validation():
+    get_queue_or_skip()
+
+    check = {}
+    x1 = dpt.empty((1,), dtype="i4")
+    x2 = dpt.empty((1,), dtype="i4")
+
+    with pytest.raises(TypeError):
+        dpt.clip(check, x1, x2)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x1, check, x2)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x1, check)
+
+    with pytest.raises(TypeError):
+        dpt.clip(x1, x1, x2, out=check)
+
+    with pytest.raises(TypeError):
+        dpt.clip(x1, x2, out=check)
+
+    with pytest.raises(TypeError):
+        dpt.clip(x1, out=check)
+
+
+@pytest.mark.parametrize(
+    "dt1,dt2", [("i4", "i4"), ("i4", "i2"), ("i2", "i4"), ("i1", "i2")]
+)
+def test_clip_order(dt1, dt2):
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="C")
+    ar2 = dpt.ones(test_shape, dtype=dt1, order="C")
+    ar3 = dpt.ones(test_shape, dtype=dt2, order="C")
+    r1 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, ar2, ar3, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.clip(ar1, ar2, ar3, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.clip(ar1, ar2, ar3, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="F")
+    ar2 = dpt.ones(test_shape, dtype=dt1, order="F")
+    ar3 = dpt.ones(test_shape, dtype=dt2, order="F")
+    r1 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, ar2, ar3, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.clip(ar1, ar2, ar3, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.clip(ar1, ar2, ar3, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2]
+    ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2]
+    r4 = dpt.clip(ar1, ar2, ar3, order="K")
+    assert r4.strides == (n, -1)
+    r5 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r5.strides == (n, 1)
+
+    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT
+    ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT
+    r4 = dpt.clip(ar1, ar2, ar3, order="K")
+    assert r4.strides == (-1, n)
+    r5 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r5.strides == (n, 1)
+
+
+@pytest.mark.parametrize("dt", ["i4", "i2"])
+def test_clip_none_order(dt):
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="C")
+    ar2 = dpt.ones(test_shape, dtype=dt, order="C")
+
+    r1 = dpt.clip(ar1, min=None, max=ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, min=None, max=ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.clip(ar1, min=None, max=ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.clip(ar1, min=None, max=ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="F")
+    ar2 = dpt.ones(test_shape, dtype=dt, order="F")
+
+    r1 = dpt.clip(ar1, min=None, max=ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, min=None, max=ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.clip(ar1, min=None, max=ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.clip(ar1, min=None, max=ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2]
+
+    r4 = dpt.clip(ar1, min=None, max=ar2, order="K")
+    assert r4.strides == (n, -1)
+    r5 = dpt.clip(ar1, min=None, max=ar2, order="C")
+    assert r5.strides == (n, 1)
+
+    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2].mT
+
+    r4 = dpt.clip(ar1, min=None, max=ar2, order="K")
+    assert r4.strides == (-1, n)
+    r5 = dpt.clip(ar1, min=None, max=ar2, order="C")
+    assert r5.strides == (n, 1)
+
+
+@pytest.mark.parametrize("usm_type1", _usm_types)
+@pytest.mark.parametrize("usm_type2", _usm_types)
+@pytest.mark.parametrize("usm_type3", _usm_types)
+def test_clip_usm_type_matrix(usm_type1, usm_type2, usm_type3):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2)
+    ar3 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type3)
+
+    r = dpt.clip(ar1, ar2, ar3)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+        (usm_type1, usm_type2, usm_type3)
+    )
+    assert r.usm_type == expected_usm_type
+
+
+@pytest.mark.parametrize("usm_type1", _usm_types)
+@pytest.mark.parametrize("usm_type2", _usm_types)
+def test_clip_usm_type_matrix_none_arg(usm_type1, usm_type2):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2)
+
+    r = dpt.clip(ar1, min=ar2, max=None)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpctl.utils.get_coerced_usm_type((usm_type1, usm_type2))
+    assert r.usm_type == expected_usm_type
+
+
+def test_clip_dtype_error():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones(1, dtype="i4")
+    ar2 = dpt.ones(1, dtype="i4")
+    ar3 = dpt.ones(1, dtype="i4")
+    ar4 = dpt.empty_like(ar1, dtype="f4")
+
+    assert_raises_regex(
+        ValueError,
+        "Output array of type.*is needed",
+        dpt.clip,
+        ar1,
+        ar2,
+        ar3,
+        ar4,
+    )
+    assert_raises_regex(
+        ValueError,
+        "Output array of type.*is needed",
+        dpt.clip,
+        ar1,
+        ar2,
+        None,
+        ar4,
+    )
+
+
+def test_clip_errors():
+    get_queue_or_skip()
+    try:
+        gpu_queue = dpctl.SyclQueue("gpu")
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("SyclQueue('gpu') failed, skipping")
+    try:
+        cpu_queue = dpctl.SyclQueue("cpu")
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("SyclQueue('cpu') failed, skipping")
+
+    ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue)
+    ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue)
+    ar3 = dpt.ones_like(ar1, sycl_queue=gpu_queue)
+    ar4 = dpt.empty_like(ar1, sycl_queue=cpu_queue)
+    assert_raises_regex(
+        ExecutionPlacementError,
+        "Input and output allocation queues are not compatible",
+        dpt.clip,
+        ar1,
+        ar2,
+        ar3,
+        ar4,
+    )
+
+    assert_raises_regex(
+        ExecutionPlacementError,
+        "Input and output allocation queues are not compatible",
+        dpt.clip,
+        ar1,
+        None,
+        ar3,
+        ar4,
+    )
+
+    assert_raises_regex(
+        ExecutionPlacementError,
+        "Execution placement can not be unambiguously inferred from input "
+        "arguments.",
+        dpt.clip,
+        ar1,
+        ar4,
+        ar2,
+        ar3,
+    )
+
+    assert_raises_regex(
+        ExecutionPlacementError,
+        "Execution placement can not be unambiguously inferred from input "
+        "arguments.",
+        dpt.clip,
+        ar1,
+        ar4,
+        1,
+        ar3,
+    )
+
+    assert_raises_regex(
+        ExecutionPlacementError,
+        "Execution placement can not be unambiguously inferred from input "
+        "arguments.",
+        dpt.clip,
+        ar1,
+        1,
+        ar4,
+        ar3,
+    )
+
+    assert_raises_regex(
+        ExecutionPlacementError,
+        "Execution placement can not be unambiguously inferred from input "
+        "arguments.",
+        dpt.clip,
+        ar1,
+        ar4,
+        None,
+        ar2,
+    )
+
+    ar1 = dpt.ones(2, dtype="float32")
+    ar2 = dpt.ones_like(ar1, dtype="float32")
+    ar3 = dpt.ones_like(ar1, dtype="float32")
+    ar4 = dpt.empty(3, dtype="float32")
+    assert_raises_regex(
+        ValueError,
+        "The shape of input and output arrays are inconsistent",
+        dpt.clip,
+        ar1,
+        ar2,
+        ar3,
+        ar4,
+    )
+
+    assert_raises_regex(
+        ValueError,
+        "The shape of input and output arrays are inconsistent",
+        dpt.clip,
+        ar1,
+        ar2,
+        None,
+        ar4,
+    )
+
+    ar1 = np.ones(2, dtype="f4")
+    ar2 = dpt.ones(2, dtype="f4")
+    ar3 = dpt.ones(2, dtype="f4")
+    assert_raises_regex(
+        TypeError,
+        "Expected `x` to be of dpctl.tensor.usm_ndarray type*",
+        dpt.clip,
+        ar1,
+        ar2,
+        ar3,
+    )
+
+    ar1 = dpt.ones(2, dtype="i4")
+    ar2 = dpt.ones_like(ar1, dtype="i4")
+    ar3 = dpt.ones_like(ar1, dtype="i4")
+    ar4 = np.empty(ar1.shape, dtype=ar1.dtype)
+    assert_raises_regex(
+        TypeError,
+        "output array must be of usm_ndarray type",
+        dpt.clip,
+        ar1,
+        ar2,
+        ar3,
+        ar4,
+    )
+
+    assert_raises_regex(
+        TypeError,
+        "output array must be of usm_ndarray type",
+        dpt.clip,
+        ar1,
+        ar2,
+        None,
+        ar4,
+    )
+
+
+def test_clip_out_type_check():
+    get_queue_or_skip()
+
+    x1 = dpt.ones(10)
+    x2 = dpt.ones(10)
+    x3 = dpt.ones(10)
+
+    out = range(10)
+
+    with pytest.raises(TypeError):
+        dpt.clip(x1, x2, x3, out=out)
+
+
+@pytest.mark.parametrize("dt", ["i4", "f4", "c8"])
+def test_clip_basic(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    sz = 1026
+    x = dpt.arange(sz, dtype=dt, sycl_queue=q)
+    r = dpt.clip(x, min=100, max=500)
+    expected = dpt.arange(sz, dtype=dt, sycl_queue=q)
+    expected[:100] = 100
+    expected[500:] = 500
+    assert dpt.all(expected == r)
+
+    x = dpt.zeros(sz, dtype=dt, sycl_queue=q)
+    a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q)
+    a_max[::2] = -2
+    r = dpt.clip(x, min=-3, max=a_max)
+    assert dpt.all(a_max == r)
+
+
+@pytest.mark.parametrize("dt", ["i4", "f4", "c8"])
+def test_clip_strided(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    sz = 2 * 1026
+    x = dpt.arange(sz, dtype=dt, sycl_queue=q)[::-2]
+    r = dpt.clip(x, min=100, max=500)
+    expected = dpt.arange(sz, dtype=dt, sycl_queue=q)
+    expected[:100] = 100
+    expected[500:] = 500
+    expected = expected[::-2]
+    assert dpt.all(expected == r)
+
+    x = dpt.zeros(sz, dtype=dt, sycl_queue=q)[::-2]
+    a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q)
+    a_max[::2] = -2
+    a_max = a_max[::-2]
+    r = dpt.clip(x, min=-3, max=a_max)
+    assert dpt.all(a_max == r)
+
+
+def test_clip_max_less_than_min():
+    get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="i4")
+    res = dpt.clip(x, 5, 0)
+    assert dpt.all(res == 0)
+
+
+@pytest.mark.parametrize("dt", ["?", "i4", "f4", "c8"])
+def test_clip_minmax_weak_types(dt):
+    get_queue_or_skip()
+
+    x = dpt.zeros(10, dtype=dt)
+    min_list = [False, 0, 0.0, 0.0 + 0.0j]
+    max_list = [True, 1, 1.0, 1.0 + 0.0j]
+
+    for min_v, max_v in zip(min_list, max_list):
+        st_dt = _strong_dtype_num_kind(dpt.dtype(dt))
+        wk_dt1 = _weak_type_num_kind(_get_dtype(min_v, x.sycl_device))
+        wk_dt2 = _weak_type_num_kind(_get_dtype(max_v, x.sycl_device))
+
+        if st_dt >= wk_dt1 and st_dt >= wk_dt2:
+            r = dpt.clip(x, min_v, max_v)
+            assert isinstance(r, dpt.usm_ndarray)
+        else:
+            with pytest.raises(ValueError):
+                dpt.clip(x, min_v, max_v)
+
+        if st_dt >= wk_dt1:
+            r = dpt.clip(x, min_v)
+            assert isinstance(r, dpt.usm_ndarray)
+
+            r = dpt.clip(x, None, min_v)
+            assert isinstance(r, dpt.usm_ndarray)
+        else:
+            with pytest.raises(ValueError):
+                dpt.clip(x, min_v)
+            with pytest.raises(ValueError):
+                dpt.clip(x, None, max_v)
+
+
+def test_clip_max_weak_type_errors():
+    get_queue_or_skip()
+
+    x = dpt.zeros(10, dtype="i4")
+    m = dpt.ones(10, dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.clip(x, m, 2.5)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x, 2.5, m)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x, 2.5)
+
+    with pytest.raises(ValueError):
+        dpt.clip(dpt.astype(x, "?"), 2)
+
+    with pytest.raises(ValueError):
+        dpt.clip(dpt.astype(x, "f4"), complex(2))
+
+
+def test_clip_unaligned():
+    get_queue_or_skip()
+
+    x = dpt.full(513, 5, dtype="i4")
+    a_min = dpt.zeros(512, dtype="i4")
+    a_max = dpt.full(512, 2, dtype="i4")
+
+    expected = dpt.full(512, 2, dtype="i4")
+    assert dpt.all(dpt.clip(x[1:], a_min, a_max) == expected)
+
+
+def test_clip_none_args():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    r = dpt.clip(x)
+    assert dpt.all(x == r)
+
+
+def test_clip_shape_errors():
+    get_queue_or_skip()
+
+    x = dpt.ones((4, 4), dtype="i4")
+    a_min = dpt.ones(5, dtype="i4")
+    a_max = dpt.ones(5, dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.clip(x, a_min, a_max)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x, a_min)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x, 0, 1, out=a_min)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x, 0, out=a_min)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x, out=a_min)
+
+
+def test_clip_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="i4", sycl_queue=q1)
+    a_min = dpt.ones(10, dtype="i4", sycl_queue=q2)
+    a_max = dpt.ones(10, dtype="i4", sycl_queue=q1)
+    res = dpt.empty_like(x, sycl_queue=q2)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.clip(x, a_min, a_max)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.clip(x, dpt.ones_like(x), a_max, out=res)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.clip(x, a_min)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.clip(x, None, a_max, out=res)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.clip(x, out=res)
+
+
+def test_clip_readonly_out():
+    get_queue_or_skip()
+    x = dpt.arange(32, dtype=dpt.int32)
+    r = dpt.empty_like(x)
+    r.flags["W"] = False
+
+    with pytest.raises(ValueError):
+        dpt.clip(x, min=0, max=10, out=r)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x, max=10, out=r)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x, min=0, out=r)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x, out=r)
+
+
+def test_clip_gh_1744():
+    get_queue_or_skip()
+    x = dpt.asarray([0, 255], dtype=dpt.uint8)
+    y = dpt.clip(x, -300, 300)
+
+    assert dpt.all(x == y)
diff --git a/dpnp/tests/tensor/test_tensor_copy_utils.py b/dpnp/tests/tensor/test_tensor_copy_utils.py
new file mode 100644
index 000000000000..878877dcaa4c
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_copy_utils.py
@@ -0,0 +1,113 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+import dpnp.tensor._copy_utils as cu
+
+from .helper import get_queue_or_skip
+
+
+def test_copy_utils_empty_like_orderK():
+    get_queue_or_skip()
+    a = dpt.empty((10, 10), dtype=dpt.int32, order="F")
+    X = cu._empty_like_orderK(a, dpt.int32, a.usm_type, a.device)
+    assert X.flags["F"]
+
+
+def test_copy_utils_empty_like_orderK_invalid_args():
+    get_queue_or_skip()
+    with pytest.raises(TypeError):
+        cu._empty_like_orderK([1, 2, 3], dpt.int32, "device", None)
+    with pytest.raises(TypeError):
+        cu._empty_like_pair_orderK(
+            [1, 2, 3],
+            (
+                1,
+                2,
+                3,
+            ),
+            dpt.int32,
+            (3,),
+            "device",
+            None,
+        )
+
+    a = dpt.empty(10, dtype=dpt.int32)
+    with pytest.raises(TypeError):
+        cu._empty_like_pair_orderK(
+            a,
+            (
+                1,
+                2,
+                3,
+            ),
+            dpt.int32,
+            (10,),
+            "device",
+            None,
+        )
+
+
+def test_copy_utils_from_numpy_empty_like_orderK():
+    q = get_queue_or_skip()
+
+    a = np.empty((10, 10), dtype=np.int32, order="C")
+    r0 = cu._from_numpy_empty_like_orderK(a, dpt.int32, "device", q)
+    assert r0.flags["C"]
+
+    b = np.empty((10, 10), dtype=np.int32, order="F")
+    r1 = cu._from_numpy_empty_like_orderK(b, dpt.int32, "device", q)
+    assert r1.flags["F"]
+
+    c = np.empty((2, 3, 4), dtype=np.int32, order="C")
+    c = np.transpose(c, (1, 0, 2))
+    r2 = cu._from_numpy_empty_like_orderK(c, dpt.int32, "device", q)
+    assert not r2.flags["C"] and not r2.flags["F"]
+
+
+def test_copy_utils_from_numpy_empty_like_orderK_invalid_args():
+    with pytest.raises(TypeError):
+        cu._from_numpy_empty_like_orderK([1, 2, 3], dpt.int32, "device", None)
+
+
+def test_gh_2055():
+    """
+    Test that `dpt.asarray` works on contiguous NumPy arrays with `order="K"`
+    when dimensions are permuted.
+
+    See: https://github.com/IntelPython/dpctl/issues/2055
+    """
+    get_queue_or_skip()
+
+    a = np.ones((2, 3, 4), dtype=dpt.int32)
+    a_t = np.transpose(a, (2, 0, 1))
+    r = dpt.asarray(a_t)
+    assert not r.flags["C"] and not r.flags["F"]
diff --git a/dpnp/tests/tensor/test_tensor_diff.py b/dpnp/tests/tensor/test_tensor_diff.py
new file mode 100644
index 000000000000..10153b5f5cc5
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_diff.py
@@ -0,0 +1,345 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from math import prod
+
+import pytest
+from dpctl.utils import ExecutionPlacementError
+from numpy.testing import assert_raises_regex
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _to_device_supported_dtype
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_all_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.mark.parametrize("dt", _all_dtypes)
+def test_diff_basic(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.asarray([9, 12, 7, 17, 10, 18, 15, 9, 8, 8], dtype=dt, sycl_queue=q)
+    op = dpt.not_equal if x.dtype is dpt.bool else dpt.subtract
+
+    # test both n=2 and n>2 branches
+    for n in [1, 2, 5]:
+        res = dpt.diff(x, n=n)
+        expected_res = x
+        for _ in range(n):
+            expected_res = op(expected_res[1:], expected_res[:-1])
+        if dpt.dtype(dt).kind in "fc":
+            assert dpt.allclose(res, expected_res)
+        else:
+            assert dpt.all(res == expected_res)
+
+
+def test_diff_axis():
+    get_queue_or_skip()
+
+    x = dpt.tile(
+        dpt.asarray([9, 12, 7, 17, 10, 18, 15, 9, 8, 8], dtype="i4"), (3, 4, 1)
+    )
+    x[:, ::2, :] = 0
+
+    for n in [1, 2, 3]:
+        res = dpt.diff(x, n=n, axis=1)
+        expected_res = x
+        for _ in range(n):
+            expected_res = dpt.subtract(
+                expected_res[:, 1:, :], expected_res[:, :-1, :]
+            )
+        assert dpt.all(res == expected_res)
+
+
+def test_diff_prepend_append_type_promotion():
+    get_queue_or_skip()
+
+    dts = [
+        ("i1", "u1", "i8"),
+        ("i1", "u8", "u1"),
+        ("u4", "i4", "f4"),
+        ("i8", "c8", "u8"),
+    ]
+
+    for dt0, dt1, dt2 in dts:
+        x = dpt.ones(10, dtype=dt1)
+        prepend = dpt.full(1, 2, dtype=dt0)
+        append = dpt.full(1, 3, dtype=dt2)
+
+        res = dpt.diff(x, prepend=prepend, append=append)
+        assert res.dtype == _to_device_supported_dtype(
+            dpt.result_type(prepend, x, append),
+            x.sycl_queue.sycl_device,
+        )
+
+        res = dpt.diff(x, prepend=prepend)
+        assert res.dtype == _to_device_supported_dtype(
+            dpt.result_type(prepend, x),
+            x.sycl_queue.sycl_device,
+        )
+
+        res = dpt.diff(x, append=append)
+        assert res.dtype == _to_device_supported_dtype(
+            dpt.result_type(x, append),
+            x.sycl_queue.sycl_device,
+        )
+
+
+def test_diff_0d():
+    get_queue_or_skip()
+
+    x = dpt.ones(())
+    with pytest.raises(ValueError):
+        dpt.diff(x)
+
+
+def test_diff_empty_array():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 0, 5))
+    res = dpt.diff(x, axis=1)
+    assert res.shape == x.shape
+
+    res = dpt.diff(x, axis=0)
+    assert res.shape == (2, 0, 5)
+
+    append = dpt.ones((3, 2, 5))
+    res = dpt.diff(x, axis=1, append=append)
+    assert res.shape == (3, 1, 5)
+
+    prepend = dpt.ones((3, 2, 5))
+    res = dpt.diff(x, axis=1, prepend=prepend)
+    assert res.shape == (3, 1, 5)
+
+
+def test_diff_no_op():
+    get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="i4")
+    res = dpt.diff(x, n=0)
+    assert dpt.all(x == res)
+
+    x = dpt.reshape(x, (2, 5))
+    res = dpt.diff(x, n=0, axis=0)
+    assert dpt.all(x == res)
+
+
+@pytest.mark.parametrize("sh,axis", [((1,), 0), ((3, 4, 5), 1)])
+def test_diff_prepend_append_py_scalars(sh, axis):
+    get_queue_or_skip()
+
+    n = 1
+
+    arr = dpt.ones(sh, dtype="i4")
+    zero = 0
+
+    # first and last elements along axis
+    # will be checked for correctness
+    sl1 = [slice(None)] * arr.ndim
+    sl1[axis] = slice(1)
+    sl1 = tuple(sl1)
+
+    sl2 = [slice(None)] * arr.ndim
+    sl2[axis] = slice(-1, None, None)
+    sl2 = tuple(sl2)
+
+    r = dpt.diff(arr, axis=axis, prepend=zero, append=zero)
+    assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis)
+    assert r.shape[axis] == arr.shape[axis] + 2 - n
+    assert dpt.all(r[sl1] == 1)
+    assert dpt.all(r[sl2] == -1)
+
+    r = dpt.diff(arr, axis=axis, prepend=zero)
+    assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis)
+    assert r.shape[axis] == arr.shape[axis] + 1 - n
+    assert dpt.all(r[sl1] == 1)
+
+    r = dpt.diff(arr, axis=axis, append=zero)
+    assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis)
+    assert r.shape[axis] == arr.shape[axis] + 1 - n
+    assert dpt.all(r[sl2] == -1)
+
+    r = dpt.diff(arr, axis=axis, prepend=dpt.asarray(zero), append=zero)
+    assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis)
+    assert r.shape[axis] == arr.shape[axis] + 2 - n
+    assert dpt.all(r[sl1] == 1)
+    assert dpt.all(r[sl2] == -1)
+
+    r = dpt.diff(arr, axis=axis, prepend=zero, append=dpt.asarray(zero))
+    assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis)
+    assert r.shape[axis] == arr.shape[axis] + 2 - n
+    assert dpt.all(r[sl1] == 1)
+    assert dpt.all(r[sl2] == -1)
+
+
+def test_tensor_diff_append_prepend_arrays():
+    get_queue_or_skip()
+
+    n = 1
+    axis = 0
+
+    for sh in [(5,), (3, 4, 5)]:
+        sz = prod(sh)
+        arr = dpt.reshape(dpt.arange(sz, 2 * sz, dtype="i4"), sh)
+        prepend = dpt.reshape(dpt.arange(sz, dtype="i4"), sh)
+        append = dpt.reshape(dpt.arange(2 * sz, 3 * sz, dtype="i4"), sh)
+        const_diff = sz / sh[axis]
+
+        r = dpt.diff(arr, axis=axis, prepend=prepend, append=append)
+        assert all(
+            r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis
+        )
+        assert (
+            r.shape[axis]
+            == arr.shape[axis] + prepend.shape[axis] + append.shape[axis] - n
+        )
+        assert dpt.all(r == const_diff)
+
+        r = dpt.diff(arr, axis=axis, prepend=prepend)
+        assert all(
+            r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis
+        )
+        assert r.shape[axis] == arr.shape[axis] + prepend.shape[axis] - n
+        assert dpt.all(r == const_diff)
+
+        r = dpt.diff(arr, axis=axis, append=append)
+        assert all(
+            r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis
+        )
+        assert r.shape[axis] == arr.shape[axis] + append.shape[axis] - n
+        assert dpt.all(r == const_diff)
+
+
+def test_diff_wrong_append_prepend_shape():
+    get_queue_or_skip()
+
+    arr = dpt.ones((3, 4, 5), dtype="i4")
+    arr_bad_sh = dpt.ones(2, dtype="i4")
+
+    assert_raises_regex(
+        ValueError,
+        ".*shape.*is invalid.*",
+        dpt.diff,
+        arr,
+        prepend=arr_bad_sh,
+        append=arr_bad_sh,
+    )
+
+    assert_raises_regex(
+        ValueError,
+        ".*shape.*is invalid.*",
+        dpt.diff,
+        arr,
+        prepend=arr,
+        append=arr_bad_sh,
+    )
+
+    assert_raises_regex(
+        ValueError,
+        ".*shape.*is invalid.*",
+        dpt.diff,
+        arr,
+        prepend=arr_bad_sh,
+    )
+
+    assert_raises_regex(
+        ValueError,
+        ".*shape.*is invalid.*",
+        dpt.diff,
+        arr,
+        append=arr_bad_sh,
+    )
+
+
+def test_diff_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+    q3 = get_queue_or_skip()
+
+    ar1 = dpt.ones(1, dtype="i4", sycl_queue=q1)
+    ar2 = dpt.ones(1, dtype="i4", sycl_queue=q2)
+    ar3 = dpt.ones(1, dtype="i4", sycl_queue=q3)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.diff(ar1, prepend=ar2, append=ar3)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.diff(ar1, prepend=ar2, append=0)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.diff(ar1, prepend=0, append=ar2)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.diff(ar1, prepend=ar2)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.diff(ar1, append=ar2)
+
+
+def test_diff_input_validation():
+    bad_in = {}
+    assert_raises_regex(
+        TypeError,
+        "Expecting dpctl.tensor.usm_ndarray type, got.*",
+        dpt.diff,
+        bad_in,
+    )
+
+
+def test_diff_positive_order():
+    get_queue_or_skip()
+
+    x = dpt.ones(1, dtype="i4")
+    n = -1
+    assert_raises_regex(
+        ValueError,
+        ".*must be positive.*",
+        dpt.diff,
+        x,
+        n=n,
+    )
diff --git a/dpnp/tests/tensor/test_tensor_dtype_routines.py b/dpnp/tests/tensor/test_tensor_dtype_routines.py
new file mode 100644
index 000000000000..588926c0d123
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_dtype_routines.py
@@ -0,0 +1,170 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+
+import dpctl
+import pytest
+
+import dpnp.tensor as dpt
+
+list_dtypes = [
+    "bool",
+    "int8",
+    "int16",
+    "int32",
+    "int64",
+    "uint8",
+    "uint16",
+    "uint32",
+    "uint64",
+    "float16",
+    "float32",
+    "float64",
+    "complex64",
+    "complex128",
+]
+
+
+dtype_categories = {
+    "bool": ["bool"],
+    "signed integer": ["int8", "int16", "int32", "int64"],
+    "unsigned integer": ["uint8", "uint16", "uint32", "uint64"],
+    "integral": [
+        "int8",
+        "int16",
+        "int32",
+        "int64",
+        "uint8",
+        "uint16",
+        "uint32",
+        "uint64",
+    ],
+    "real floating": ["float16", "float32", "float64"],
+    "complex floating": ["complex64", "complex128"],
+    "numeric": [d for d in list_dtypes if d != "bool"],
+}
+
+
+@pytest.mark.parametrize("kind_str", dtype_categories.keys())
+@pytest.mark.parametrize("dtype_str", list_dtypes)
+def test_isdtype_kind_str(dtype_str, kind_str):
+    dt = dpt.dtype(dtype_str)
+    is_in_kind = dpt.isdtype(dt, kind_str)
+    expected = dtype_str in dtype_categories[kind_str]
+    assert is_in_kind == expected
+
+
+@pytest.mark.parametrize("dtype_str", list_dtypes)
+def test_isdtype_kind_tuple(dtype_str):
+    dt = dpt.dtype(dtype_str)
+    if dtype_str.startswith("bool"):
+        assert dpt.isdtype(dt, ("real floating", "bool"))
+        assert not dpt.isdtype(
+            dt, ("integral", "real floating", "complex floating")
+        )
+    elif dtype_str.startswith("int"):
+        assert dpt.isdtype(dt, ("real floating", "signed integer"))
+        assert not dpt.isdtype(
+            dt, ("bool", "unsigned integer", "real floating")
+        )
+    elif dtype_str.startswith("uint"):
+        assert dpt.isdtype(dt, ("bool", "unsigned integer"))
+        assert not dpt.isdtype(dt, ("real floating", "complex floating"))
+    elif dtype_str.startswith("float"):
+        assert dpt.isdtype(dt, ("complex floating", "real floating"))
+        assert not dpt.isdtype(dt, ("integral", "complex floating", "bool"))
+    else:
+        assert dpt.isdtype(dt, ("integral", "complex floating"))
+        assert not dpt.isdtype(dt, ("bool", "integral", "real floating"))
+
+
+@pytest.mark.parametrize("dtype_str", list_dtypes)
+def test_isdtype_kind_tuple_dtypes(dtype_str):
+    dt = dpt.dtype(dtype_str)
+    if dtype_str.startswith("bool"):
+        assert dpt.isdtype(dt, (dpt.int32, dpt.bool))
+        assert not dpt.isdtype(dt, (dpt.int16, dpt.uint32, dpt.float64))
+
+    elif dtype_str.startswith("int"):
+        assert dpt.isdtype(dt, (dpt.int8, dpt.int16, dpt.int32, dpt.int64))
+        assert not dpt.isdtype(dt, (dpt.bool, dpt.float32, dpt.complex64))
+
+    elif dtype_str.startswith("uint"):
+        assert dpt.isdtype(dt, (dpt.uint8, dpt.uint16, dpt.uint32, dpt.uint64))
+        assert not dpt.isdtype(dt, (dpt.bool, dpt.int32, dpt.float32))
+
+    elif dtype_str.startswith("float"):
+        assert dpt.isdtype(dt, (dpt.float16, dpt.float32, dpt.float64))
+        assert not dpt.isdtype(dt, (dpt.bool, dpt.complex64, dpt.int8))
+
+    else:
+        assert dpt.isdtype(dt, (dpt.complex64, dpt.complex128))
+        assert not dpt.isdtype(dt, (dpt.bool, dpt.uint64, dpt.int8))
+
+
+@pytest.mark.parametrize(
+    "kind",
+    [
+        [dpt.int32, dpt.bool],
+        "f4",
+        float,
+        123,
+        "complex",
+    ],
+)
+def test_isdtype_invalid_kind(kind):
+    with pytest.raises((TypeError, ValueError)):
+        dpt.isdtype(dpt.int32, kind)
+
+
+def test_finfo_array():
+    try:
+        x = dpt.empty(tuple(), dtype="f4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("Default-selected SYCL device unavailable")
+    o = dpt.finfo(x)
+    assert o.dtype == dpt.float32
+
+
+def test_iinfo_array():
+    try:
+        x = dpt.empty(tuple(), dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("Default-selected SYCL device unavailable")
+    o = dpt.iinfo(x)
+    assert o.dtype == dpt.int32
+
+
+def test_iinfo_validation():
+    with pytest.raises(ValueError):
+        dpt.iinfo("O")
+
+
+def test_finfo_validation():
+    with pytest.raises(ValueError):
+        dpt.iinfo("O")
diff --git a/dpnp/tests/tensor/test_tensor_isin.py b/dpnp/tests/tensor/test_tensor_isin.py
new file mode 100644
index 000000000000..0bb22ea242ad
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_isin.py
@@ -0,0 +1,282 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import numpy as np
+import pytest
+from dpctl.utils import ExecutionPlacementError
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_numeric_dtypes = [
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+_all_dtypes = ["?"] + _numeric_dtypes
+
+
+@pytest.mark.parametrize("dtype", _numeric_dtypes)
+def test_isin_basic(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n = 100
+    x = dpt.arange(n, dtype=dtype, sycl_queue=q)
+    test = dpt.arange(n - 1, dtype=dtype, sycl_queue=q)
+    r1 = dpt.isin(x, test)
+    assert dpt.all(r1[:-1])
+    assert not r1[-1]
+    assert r1.shape == x.shape
+
+    # test with invert keyword
+    r2 = dpt.isin(x, test, invert=True)
+    assert not dpt.any(r2[:-1])
+    assert r2[-1]
+    assert r2.shape == x.shape
+
+
+def test_isin_basic_bool():
+    dt = dpt.bool
+    n = 100
+    x = dpt.zeros(n, dtype=dt)
+    x[-1] = True
+    test = dpt.zeros((), dtype=dt)
+    r1 = dpt.isin(x, test)
+    assert dpt.all(r1[:-1])
+    assert not r1[-1]
+    assert r1.shape == x.shape
+
+    r2 = dpt.isin(x, test, invert=True)
+    assert not dpt.any(r2[:-1])
+    assert r2[-1]
+    assert r2.shape == x.shape
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_isin_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, m = 100, 20
+    x = dpt.zeros((n, m), dtype=dtype, order="F", sycl_queue=q)
+    x[:, ::2] = dpt.arange(1, (m / 2) + 1, dtype=dtype, sycl_queue=q)
+    x_s = x[:, ::2]
+    test = dpt.arange(1, (m / 2), dtype=dtype, sycl_queue=q)
+    r1 = dpt.isin(x_s, test)
+    assert dpt.all(r1[:, :-1])
+    assert not dpt.any(r1[:, -1])
+    assert not dpt.any(x[:, 1::2])
+    assert r1.shape == x_s.shape
+    assert r1.flags.c_contiguous
+
+    # test with invert keyword
+    r2 = dpt.isin(x_s, test, invert=True)
+    assert not dpt.any(r2[:, :-1])
+    assert dpt.all(r2[:, -1])
+    assert not dpt.any(x[:, 1:2])
+    assert r2.shape == x_s.shape
+    assert r2.flags.c_contiguous
+
+
+def test_isin_strided_bool():
+    dt = dpt.bool
+
+    n, m = 100, 20
+    x = dpt.zeros((n, m), dtype=dt, order="F")
+    x[:, :-2:2] = True
+    x_s = x[:, ::2]
+    test = dpt.ones((), dtype=dt)
+    r1 = dpt.isin(x_s, test)
+    assert dpt.all(r1[:, :-1])
+    assert not dpt.any(r1[:, -1])
+    assert not dpt.any(x[:, 1::2])
+    assert r1.shape == x_s.shape
+    assert r1.flags.c_contiguous
+
+    # test with invert keyword
+    r2 = dpt.isin(x_s, test, invert=True)
+    assert not dpt.any(r2[:, :-1])
+    assert dpt.all(r2[:, -1])
+    assert not dpt.any(x[:, 1:2])
+    assert r2.shape == x_s.shape
+    assert r2.flags.c_contiguous
+
+
+@pytest.mark.parametrize("dt1", _numeric_dtypes)
+@pytest.mark.parametrize("dt2", _numeric_dtypes)
+def test_isin_dtype_matrix(dt1, dt2):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt1, q)
+    skip_if_dtype_not_supported(dt2, q)
+
+    sz = 10
+    x = dpt.asarray([0, 1, 11], dtype=dt1, sycl_queue=q)
+    test1 = dpt.arange(sz, dtype=dt2, sycl_queue=q)
+
+    r1 = dpt.isin(x, test1)
+    assert isinstance(r1, dpt.usm_ndarray)
+    assert r1.dtype == dpt.bool
+    assert r1.shape == x.shape
+    assert not r1[-1]
+    assert dpt.all(r1[0:-1])
+    assert r1.sycl_queue == x.sycl_queue
+
+    test2 = dpt.tile(dpt.asarray([[0, 1]], dtype=dt2, sycl_queue=q).mT, 2)
+    r2 = dpt.isin(x, test2)
+    assert isinstance(r2, dpt.usm_ndarray)
+    assert r2.dtype == dpt.bool
+    assert r2.shape == x.shape
+    assert not r2[-1]
+    assert dpt.all(r1[0:-1])
+    assert r2.sycl_queue == x.sycl_queue
+
+
+def test_isin_empty_inputs():
+    get_queue_or_skip()
+
+    x = dpt.ones((10, 0, 1), dtype="i4")
+    test = dpt.ones((), dtype="i4")
+    res1 = dpt.isin(x, test)
+    assert isinstance(res1, dpt.usm_ndarray)
+    assert res1.size == 0
+    assert res1.shape == x.shape
+    assert res1.dtype == dpt.bool
+
+    res2 = dpt.isin(x, test, invert=True)
+    assert isinstance(res2, dpt.usm_ndarray)
+    assert res2.size == 0
+    assert res2.shape == x.shape
+    assert res2.dtype == dpt.bool
+
+    x = dpt.ones((3, 3), dtype="i4")
+    test = dpt.ones(0, dtype="i4")
+    res3 = dpt.isin(x, test)
+    assert isinstance(res3, dpt.usm_ndarray)
+    assert res3.shape == x.shape
+    assert res3.dtype == dpt.bool
+    assert not dpt.all(res3)
+
+    res4 = dpt.isin(x, test, invert=True)
+    assert isinstance(res4, dpt.usm_ndarray)
+    assert res4.shape == x.shape
+    assert res4.dtype == dpt.bool
+    assert dpt.all(res4)
+
+
+def test_isin_validation():
+    get_queue_or_skip()
+    with pytest.raises(ExecutionPlacementError):
+        dpt.isin(1, 1)
+    not_bool = {}
+    with pytest.raises(TypeError):
+        dpt.isin(dpt.ones([1]), dpt.ones([1]), invert=not_bool)
+
+
+def test_isin_special_floating_point_vals():
+    get_queue_or_skip()
+
+    # real and complex nans compare false
+    x = dpt.asarray(dpt.nan, dtype="f4")
+    test = dpt.asarray(dpt.nan, dtype="f4")
+    assert not dpt.isin(x, test)
+
+    x = dpt.asarray(dpt.nan, dtype="c8")
+    test = dpt.asarray(dpt.nan, dtype="c8")
+    assert not dpt.isin(x, test)
+
+    # -0.0 compares equal to +0.0
+    x = dpt.asarray(-0.0, dtype="f4")
+    test = dpt.asarray(0.0, dtype="f4")
+    assert dpt.isin(x, test)
+    assert dpt.isin(test, x)
+
+
+@pytest.mark.parametrize("dt", _all_dtypes)
+def test_isin_py_scalars(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.zeros((10, 10), dtype=dt, sycl_queue=q)
+    py_zeros = (
+        bool(0),
+        int(0),
+        float(0),
+        complex(0),
+        np.float32(0),
+        ctypes.c_int(0),
+    )
+    for sc in py_zeros:
+        r1 = dpt.isin(x, sc)
+        assert isinstance(r1, dpt.usm_ndarray)
+        r2 = dpt.isin(sc, x)
+        assert isinstance(r2, dpt.usm_ndarray)
+
+
+def test_isin_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    x = dpt.ones(10, sycl_queue=q1)
+    test = dpt.ones_like(x, sycl_queue=q2)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.isin(x, test)
diff --git a/dpnp/tests/tensor/test_tensor_statistical_functions.py b/dpnp/tests/tensor/test_tensor_statistical_functions.py
new file mode 100644
index 000000000000..7e444500d75f
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_statistical_functions.py
@@ -0,0 +1,271 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._tensor_impl import default_device_fp_type
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_no_complex_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+]
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes)
+def test_mean_dtypes(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.ones(10, dtype=dt)
+    res = dpt.mean(x)
+    assert res == 1
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes)
+@pytest.mark.parametrize("py_zero", [float(0), int(0)])
+def test_std_var_dtypes(dt, py_zero):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.ones(10, dtype=dt)
+    res = dpt.std(x, correction=py_zero)
+    assert res == 0
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+    res = dpt.var(x, correction=py_zero)
+    assert res == 0
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+
+def test_stat_fns_axis():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
+    m = dpt.mean(x, axis=(1, 2, -1))
+
+    assert isinstance(m, dpt.usm_ndarray)
+    assert m.shape == (3, 6)
+    assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype))
+
+    s = dpt.var(x, axis=(1, 2, -1))
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 6)
+    assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype))
+
+
+@pytest.mark.parametrize("fn", [dpt.mean, dpt.var])
+def test_stat_fns_empty(fn):
+    get_queue_or_skip()
+    x = dpt.empty((0,), dtype="f4")
+    r = fn(x)
+    assert r.shape == ()
+    assert dpt.isnan(r)
+
+    x = dpt.empty((10, 0, 2), dtype="f4")
+    r = fn(x, axis=1)
+    assert r.shape == (10, 2)
+    assert dpt.all(dpt.isnan(r))
+
+    r = fn(x, axis=0)
+    assert r.shape == (0, 2)
+    assert r.size == 0
+
+
+def test_stat_fns_keepdims():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
+    m = dpt.mean(x, axis=(1, 2, -1), keepdims=True)
+
+    assert isinstance(m, dpt.usm_ndarray)
+    assert m.shape == (3, 1, 1, 6, 1)
+    assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype))
+
+    s = dpt.var(x, axis=(1, 2, -1), keepdims=True)
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 1, 1, 6, 1)
+    assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype))
+
+
+def test_stat_fns_empty_axis():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5))
+    m = dpt.mean(x, axis=())
+
+    assert x.shape == m.shape
+    assert dpt.all(x == m)
+
+    s = dpt.var(x, axis=())
+    assert x.shape == s.shape
+    assert dpt.all(s == 0)
+
+    d = dpt.std(x, axis=())
+    assert x.shape == d.shape
+    assert dpt.all(d == 0)
+
+
+def test_mean():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
+    m = dpt.mean(x)
+    expected = dpt.asarray(4, dtype="f4")
+    assert dpt.allclose(m, expected)
+
+    m = dpt.mean(x, axis=0)
+    expected = dpt.arange(3, 6, dtype="f4")
+    assert dpt.allclose(m, expected)
+
+    m = dpt.mean(x, axis=1)
+    expected = dpt.asarray([1, 4, 7], dtype="f4")
+    assert dpt.allclose(m, expected)
+
+
+def test_var_std():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
+    r = dpt.var(x)
+    expected = dpt.asarray(6.666666507720947, dtype="f4")
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.var(x, correction=3)
+    expected1 = dpt.asarray(10.0, dtype="f4")
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.std(x)
+    expected = dpt.sqrt(expected)
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.std(x, correction=3)
+    expected1 = dpt.sqrt(expected1)
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.var(x, axis=0)
+    expected = dpt.full(x.shape[1], 6, dtype="f4")
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.var(x, axis=0, correction=1)
+    expected1 = dpt.full(x.shape[1], 9, dtype="f4")
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.std(x, axis=0)
+    expected = dpt.sqrt(expected)
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.std(x, axis=0, correction=1)
+    expected1 = dpt.sqrt(expected1)
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.var(x, axis=1)
+    expected = dpt.full(x.shape[0], 0.6666666865348816, dtype="f4")
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.var(x, axis=1, correction=1)
+    expected1 = dpt.ones(x.shape[0], dtype="f4")
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.std(x, axis=1)
+    expected = dpt.sqrt(expected)
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.std(x, axis=1, correction=1)
+    expected1 = dpt.sqrt(expected1)
+    assert dpt.allclose(r1, expected1)
+
+
+def test_var_axis_length_correction():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
+
+    r = dpt.var(x, correction=x.size)
+    assert dpt.isnan(r)
+
+    r = dpt.var(x, axis=0, correction=x.shape[0])
+    assert dpt.all(dpt.isnan(r))
+
+    r = dpt.var(x, axis=1, correction=x.shape[1])
+    assert dpt.all(dpt.isnan(r))
+
+
+def test_stat_function_errors():
+    d = {}
+    with pytest.raises(TypeError):
+        dpt.var(d)
+    with pytest.raises(TypeError):
+        dpt.std(d)
+    with pytest.raises(TypeError):
+        dpt.mean(d)
+
+    get_queue_or_skip()
+    x = dpt.empty(1, dtype="f4")
+    with pytest.raises(TypeError):
+        dpt.var(x, axis=d)
+    with pytest.raises(TypeError):
+        dpt.std(x, axis=d)
+    with pytest.raises(TypeError):
+        dpt.mean(x, axis=d)
+
+    with pytest.raises(TypeError):
+        dpt.var(x, correction=d)
+    with pytest.raises(TypeError):
+        dpt.std(x, correction=d)
+
+    x = dpt.empty(1, dtype="c8")
+    with pytest.raises(ValueError):
+        dpt.var(x)
+    with pytest.raises(ValueError):
+        dpt.std(x)
diff --git a/dpnp/tests/tensor/test_tensor_sum.py b/dpnp/tests/tensor/test_tensor_sum.py
new file mode 100644
index 000000000000..90e548f1b28c
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_sum.py
@@ -0,0 +1,348 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_all_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.mark.parametrize("arg_dtype", _all_dtypes)
+def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    # test reduction for C-contiguous input
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.sum(m)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    if m.dtype.kind == "i":
+        assert r.dtype.kind == "i"
+    elif m.dtype.kind == "u":
+        assert r.dtype.kind == "u"
+    elif m.dtype.kind == "f":
+        assert r.dtype.kind == "f"
+    elif m.dtype.kind == "c":
+        assert r.dtype.kind == "c"
+
+    assert dpt.all(r == 100)
+
+    # test reduction for strided input
+    m = dpt.ones(200, dtype=arg_dtype)[:1:-2]
+    r = dpt.sum(m)
+    assert dpt.all(r == 99)
+
+    # test reduction for strided input which can be simplified
+    # to contiguous computation
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.sum(dpt.flip(m))
+    assert dpt.all(r == 100)
+
+
+@pytest.mark.parametrize("arg_dtype", _all_dtypes)
+@pytest.mark.parametrize("out_dtype", _all_dtypes[1:])
+def test_sum_arg_out_dtype_matrix(arg_dtype, out_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+    skip_if_dtype_not_supported(out_dtype, q)
+
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.sum(m, dtype=out_dtype)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == dpt.dtype(out_dtype)
+    assert dpt.all(r == 100)
+
+
+def test_sum_empty():
+    get_queue_or_skip()
+    x = dpt.empty((0,), dtype="u1")
+    y = dpt.sum(x)
+    assert y.shape == ()
+    assert int(y) == 0
+
+
+def test_sum_axis():
+    get_queue_or_skip()
+
+    m = dpt.ones((3, 4, 5, 6, 7), dtype="i4")
+    s = dpt.sum(m, axis=(1, 2, -1))
+
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 6)
+    assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype="i4"))
+
+
+def test_sum_keepdims():
+    get_queue_or_skip()
+
+    m = dpt.ones((3, 4, 5, 6, 7), dtype="i4")
+    s = dpt.sum(m, axis=(1, 2, -1), keepdims=True)
+
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 1, 1, 6, 1)
+    assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype=s.dtype))
+
+
+def test_sum_scalar():
+    get_queue_or_skip()
+
+    m = dpt.ones(())
+    s = dpt.sum(m)
+
+    assert isinstance(s, dpt.usm_ndarray)
+    assert m.sycl_queue == s.sycl_queue
+    assert s.shape == ()
+    assert s == dpt.full((), 1)
+
+
+@pytest.mark.parametrize("arg_dtype", _all_dtypes)
+@pytest.mark.parametrize("out_dtype", _all_dtypes[1:])
+def test_sum_arg_out_dtype_scalar(arg_dtype, out_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+    skip_if_dtype_not_supported(out_dtype, q)
+
+    m = dpt.ones((), dtype=arg_dtype)
+    r = dpt.sum(m, dtype=out_dtype)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == dpt.dtype(out_dtype)
+    assert r == 1
+
+
+def test_sum_keepdims_zero_size():
+    """See gh-1293"""
+    get_queue_or_skip()
+    n = 10
+    a = dpt.ones((n, 0, n))
+
+    s1 = dpt.sum(a, keepdims=True)
+    assert s1.shape == (1, 1, 1)
+
+    s2 = dpt.sum(a, axis=(0, 1), keepdims=True)
+    assert s2.shape == (1, 1, n)
+
+    s3 = dpt.sum(a, axis=(1, 2), keepdims=True)
+    assert s3.shape == (n, 1, 1)
+
+    s4 = dpt.sum(a, axis=(0, 2), keepdims=True)
+    assert s4.shape == (1, 0, 1)
+
+    a0 = a[0]
+    s5 = dpt.sum(a0, keepdims=True)
+    assert s5.shape == (1, 1)
+
+
+@pytest.mark.parametrize("arg_dtype", ["i8", "f4", "c8"])
+@pytest.mark.parametrize("n", [1023, 1024, 1025])
+def test_largish_reduction(arg_dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    m = 5
+    x = dpt.ones((m, n, m), dtype=arg_dtype)
+
+    y1 = dpt.sum(x, axis=(0, 1))
+    y2 = dpt.sum(x, axis=(1, 2))
+
+    assert dpt.all(dpt.equal(y1, y2))
+    assert dpt.all(dpt.equal(y1, n * m))
+
+
+@pytest.mark.parametrize("n", [1023, 1024, 1025])
+def test_largish_reduction_axis1_axis0(n):
+    get_queue_or_skip()
+
+    m = 25
+    x1 = dpt.ones((m, n), dtype="f4")
+    x2 = dpt.ones((n, m), dtype="f4")
+
+    y1 = dpt.sum(x1, axis=1)
+    y2 = dpt.sum(x2, axis=0)
+
+    assert dpt.all(y1 == n)
+    assert dpt.all(y2 == n)
+
+
+def test_axis0_bug():
+    "gh-1391"
+    get_queue_or_skip()
+
+    sh = (1, 2, 3)
+    a = dpt.arange(sh[0] * sh[1] * sh[2], dtype="i4")
+    a = dpt.reshape(a, sh)
+    aT = dpt.permute_dims(a, (2, 1, 0))
+
+    s = dpt.sum(aT, axis=2)
+    expected = dpt.asarray([[0, 3], [1, 4], [2, 5]])
+
+    assert dpt.all(s == expected)
+
+
+def test_sum_axis1_axis0():
+    """See gh-1455"""
+    get_queue_or_skip()
+
+    # The atomic case is checked in `test_usm_ndarray_reductions`
+    # This test checks the tree reduction path for correctness
+    x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5))
+
+    m = dpt.sum(x, axis=0)
+    expected = dpt.asarray(
+        [
+            [60, 63, 66, 69, 72],
+            [75, 78, 81, 84, 87],
+            [90, 93, 96, 99, 102],
+            [105, 108, 111, 114, 117],
+        ],
+        dtype="f4",
+    )
+    tol = dpt.finfo(m.dtype).resolution
+    assert dpt.allclose(m, expected, atol=tol, rtol=tol)
+
+    x = dpt.flip(x, axis=2)
+    m = dpt.sum(x, axis=2)
+    expected = dpt.asarray(
+        [[10, 35, 60, 85], [110, 135, 160, 185], [210, 235, 260, 285]],
+        dtype="f4",
+    )
+    assert dpt.allclose(m, expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("arg_dtype", _all_dtypes[1:])
+def test_prod_arg_dtype_default_output_dtype_matrix(arg_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    arg_dtype = dpt.dtype(arg_dtype)
+
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.prod(m)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    if m.dtype.kind == "i":
+        assert r.dtype.kind == "i"
+    elif m.dtype.kind == "u":
+        assert r.dtype.kind == "u"
+    elif m.dtype.kind == "f":
+        assert r.dtype.kind == "f"
+    elif m.dtype.kind == "c":
+        assert r.dtype.kind == "c"
+    assert dpt.all(r == 1)
+
+    if dpt.isdtype(m.dtype, "unsigned integer"):
+        m = dpt.tile(dpt.arange(1, 3, dtype=arg_dtype), 10)[:1:-2]
+        r = dpt.prod(m)
+        assert dpt.all(r == dpt.asarray(512, dtype=r.dtype))
+    else:
+        m = dpt.full(200, -1, dtype=arg_dtype)[:1:-2]
+        r = dpt.prod(m)
+        assert dpt.all(r == dpt.asarray(-1, dtype=r.dtype))
+
+
+def test_prod_empty():
+    get_queue_or_skip()
+    x = dpt.empty((0,), dtype="u1")
+    y = dpt.prod(x)
+    assert y.shape == ()
+    assert int(y) == 1
+
+
+def test_prod_axis():
+    get_queue_or_skip()
+
+    m = dpt.ones((3, 4, 5, 6, 7), dtype="i4")
+    s = dpt.prod(m, axis=(1, 2, -1))
+
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 6)
+    assert dpt.all(s == dpt.asarray(1, dtype="i4"))
+
+
+@pytest.mark.parametrize("arg_dtype", _all_dtypes)
+@pytest.mark.parametrize("out_dtype", _all_dtypes[1:])
+def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+    skip_if_dtype_not_supported(out_dtype, q)
+
+    out_dtype = dpt.dtype(out_dtype)
+    arg_dtype = dpt.dtype(arg_dtype)
+
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.prod(m, dtype=out_dtype)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == dpt.dtype(out_dtype)
+    assert dpt.all(r == 1)
+
+
+def test_gh_1468():
+    "See https://github.com/IntelPython/dpctl/issues/1468"
+    get_queue_or_skip()
+
+    a = dpt.full((2, 3, 4), 123456789, dtype=dpt.int32)
+    t = dpt.sum(a, dtype="f4")
+    assert t > 0
+
+
+@pytest.mark.parametrize(
+    "dt", ["i1", "i2", "i4", "i8", "f2", "f4", "f8", "c8", "c16"]
+)
+def test_gh_1944(dt):
+    "See https://github.com/IntelPython/dpctl/issues/1944"
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+    x = dpt.asarray([-1, 1], dtype=dpt.dtype(dt), sycl_queue=q)
+    r = dpt.sum(x, dtype="?")
+    # reduction must be performed in the requested dtype
+    # if performed in the input type, result is False
+    assert r
diff --git a/dpnp/tests/tensor/test_tensor_testing.py b/dpnp/tests/tensor/test_tensor_testing.py
new file mode 100644
index 000000000000..34cc40987354
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_testing.py
@@ -0,0 +1,181 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_all_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_allclose(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    a1 = dpt.ones(10, dtype=dtype)
+    a2 = dpt.ones(10, dtype=dtype)
+
+    assert dpt.allclose(a1, a2)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_allclose_real_fp(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    v = [dpt.nan, -dpt.nan, dpt.inf, -dpt.inf, -0.0, 0.0, 1.0, -1.0]
+    a1 = dpt.asarray(v[2:], dtype=dtype)
+    a2 = dpt.asarray(v[2:], dtype=dtype)
+
+    tol = dpt.finfo(a1.dtype).resolution
+    assert dpt.allclose(a1, a2, atol=tol, rtol=tol)
+
+    a1 = dpt.asarray(v, dtype=dtype)
+    a2 = dpt.asarray(v, dtype=dtype)
+
+    assert not dpt.allclose(a1, a2, atol=tol, rtol=tol)
+    assert dpt.allclose(a1, a2, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_allclose_complex_fp(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    v = [dpt.nan, -dpt.nan, dpt.inf, -dpt.inf, -0.0, 0.0, 1.0, -1.0]
+
+    not_nans = [complex(*xy) for xy in itertools.product(v[2:], repeat=2)]
+    z1 = dpt.asarray(not_nans, dtype=dtype)
+    z2 = dpt.asarray(not_nans, dtype=dtype)
+
+    tol = dpt.finfo(z1.dtype).resolution
+    assert dpt.allclose(z1, z2, atol=tol, rtol=tol)
+
+    both = [complex(*xy) for xy in itertools.product(v, repeat=2)]
+    z1 = dpt.asarray(both, dtype=dtype)
+    z2 = dpt.asarray(both, dtype=dtype)
+
+    tol = dpt.finfo(z1.dtype).resolution
+    assert not dpt.allclose(z1, z2, atol=tol, rtol=tol)
+    assert dpt.allclose(z1, z2, atol=tol, rtol=tol, equal_nan=True)
+
+
+def test_allclose_validation():
+    with pytest.raises(TypeError):
+        dpt.allclose(True, False)
+
+    get_queue_or_skip()
+    x = dpt.asarray(True)
+    with pytest.raises(TypeError):
+        dpt.allclose(x, False)
+
+
+def test_allclose_type_promotion():
+    get_queue_or_skip()
+
+    x1 = dpt.ones(10, dtype="i4")
+    x2 = dpt.ones(10, dtype="i8")
+
+    assert dpt.allclose(x1, x2)
+
+
+def test_allclose_tolerance():
+    get_queue_or_skip()
+
+    x = dpt.zeros(10, dtype="f4")
+    atol = 1e-5
+    y = dpt.full_like(x, atol)
+    assert dpt.allclose(x, y, atol=atol, rtol=0)
+
+    # about 8e-6
+    tol = float.fromhex("0x1.0p-17")
+    x = dpt.ones(10, dtype="f4")
+    y = x - tol
+    assert dpt.allclose(x, y, atol=0, rtol=tol)
+
+
+def test_allclose_real_fp_early_exists():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([0.0, dpt.inf, -dpt.inf], dtype="f4")
+    x2 = dpt.asarray([dpt.inf, 0.0, -dpt.inf], dtype="f4")
+
+    # early exists, inf positions are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf, dpt.inf], dtype="f4")
+
+    # early exists, inf positions are the same, but signs differ
+    assert not dpt.allclose(x1, x2)
+
+
+def test_allclose_complex_fp_early_exists():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([0.0, dpt.inf, -dpt.inf], dtype="c8")
+    x2 = dpt.asarray([dpt.inf, 0.0, -dpt.inf], dtype="c8")
+
+    # early exists, inf positions of real parts are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf, dpt.inf], dtype="c8")
+
+    # early exists, inf positions of real parts are the same, but signs differ
+    assert not dpt.allclose(x1, x2)
+
+    x1 = dpt.asarray([0.0, dpt.inf * 1j, -dpt.inf * 1j], dtype="c8")
+    x2 = dpt.asarray([dpt.inf * 1j, 0.0, -dpt.inf * 1j], dtype="c8")
+
+    # early exists, inf positions of imag parts are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf * 1j, dpt.inf * 1j], dtype="c8")
+    assert not dpt.allclose(x1, x2)
diff --git a/dpnp/tests/tensor/test_usm_ndarray_ctor.py b/dpnp/tests/tensor/test_usm_ndarray_ctor.py
new file mode 100644
index 000000000000..cb185ff64a1f
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_ctor.py
@@ -0,0 +1,2324 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+import numbers
+from math import prod
+
+import dpctl
+import dpctl.memory as dpm
+import numpy as np
+import pytest
+from numpy.testing import assert_raises_regex
+
+import dpnp.tensor as dpt
+from dpnp.tensor import Device
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_all_dtypes = [
+    "b1",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (),
+        (4,),
+        (0,),
+        (0, 1),
+        (0, 0),
+        (4, 5),
+        (2, 5, 2),
+        (2, 2, 2, 2, 2, 2, 2, 2),
+        5,
+        np.int32(7),
+    ],
+)
+@pytest.mark.parametrize("usm_type", ["shared", "host", "device"])
+def test_allocate_usm_ndarray(shape, usm_type):
+    q = get_queue_or_skip()
+    X = dpt.usm_ndarray(
+        shape, dtype="i8", buffer=usm_type, buffer_ctor_kwargs={"queue": q}
+    )
+    Xnp = np.ndarray(shape, dtype="i8")
+    assert X.usm_type == usm_type
+    assert X.sycl_context == q.sycl_context
+    assert X.sycl_device == q.sycl_device
+    assert X.size == Xnp.size
+    assert X.shape == Xnp.shape
+    assert X.shape == X.__sycl_usm_array_interface__["shape"]
+
+
+def test_usm_ndarray_flags():
+    get_queue_or_skip()
+    f = dpt.usm_ndarray((5,), dtype="i4").flags
+    assert f.fc
+    assert f.forc
+
+    f = dpt.usm_ndarray((5, 2), dtype="i4").flags
+    assert f.c_contiguous
+    assert f.forc
+
+    f = dpt.usm_ndarray((5, 2), dtype="i4", order="F").flags
+    assert f.f_contiguous
+    assert f.forc
+    assert f.fnc
+
+    f = dpt.usm_ndarray((5,), dtype="i4", strides=(1,)).flags
+    assert f.fc
+    assert f.forc
+
+    f = dpt.usm_ndarray((5, 1, 2), dtype="i4", strides=(2, 0, 1)).flags
+    assert f.c_contiguous
+    assert f.forc
+
+    f = dpt.usm_ndarray((5, 1, 2), dtype="i4", strides=(1, 0, 5)).flags
+    assert f.f_contiguous
+    assert f.forc
+    assert f.fnc
+
+    f = dpt.usm_ndarray((5, 0, 1), dtype="i4", strides=(1, 0, 1)).flags
+    assert f.fc
+    assert f.forc
+    assert not dpt.usm_ndarray(
+        (5, 1, 1), dtype="i4", strides=(2, 0, 1)
+    ).flags.forc
+
+    x = dpt.empty(5, dtype="u2")
+    assert x.flags.writable is True
+    x.flags.writable = False
+    assert x.flags.writable is False
+    with pytest.raises(ValueError):
+        x[:] = 0
+    x.flags["W"] = True
+    assert x.flags.writable is True
+    x.flags["WRITABLE"] = True
+    assert x.flags.writable is True
+    x[:] = 0
+
+    with pytest.raises(TypeError):
+        x.flags.writable = {}
+    with pytest.raises(ValueError):
+        x.flags["C"] = False
+
+
+def test_usm_ndarray_flags_bug_gh_1334():
+    get_queue_or_skip()
+    a = dpt.ones((2, 3), dtype="u4")
+    r = dpt.reshape(a, (1, 6, 1))
+    assert r.flags["C"] and r.flags["F"]
+
+    a = dpt.ones((2, 3), dtype="u4", order="F")
+    r = dpt.reshape(a, (1, 6, 1), order="F")
+    assert r.flags["C"] and r.flags["F"]
+
+    a = dpt.ones((2, 3, 4), dtype="i8")
+    r = dpt.sum(a, axis=(1, 2), keepdims=True)
+    assert r.flags["C"] and r.flags["F"]
+
+    a = dpt.ones((2, 1), dtype="?")
+    r = a[:, 1::-1]
+    assert r.flags["F"] and r.flags["C"]
+
+
+def test_usm_ndarray_writable_flag_views():
+    get_queue_or_skip()
+    a = dpt.arange(10, dtype="f4")
+    a.flags["W"] = False
+
+    a.shape = (5, 2)
+    assert not a.flags.writable
+    assert not a.T.flags.writable
+    assert not a.mT.flags.writable
+    assert not a.real.flags.writable
+    assert not a[0:3].flags.writable
+
+    a = dpt.arange(10, dtype="c8")
+    a.flags["W"] = False
+
+    assert not a.real.flags.writable
+    assert not a.imag.flags.writable
+
+
+@pytest.mark.parametrize("dt1", _all_dtypes)
+@pytest.mark.parametrize("dt2", _all_dtypes)
+def test_usm_ndarray_from_zero_sized_usm_ndarray(dt1, dt2):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt1, q)
+    skip_if_dtype_not_supported(dt2, q)
+
+    x1 = dpt.ones((0,), dtype=dt1, sycl_queue=q)
+    x2 = dpt.usm_ndarray(x1.shape, dtype=dt2, buffer=x1)
+    assert x2.dtype == dt2
+    assert x2.sycl_queue == q
+    assert x2._pointer == x1._pointer
+    assert x2.shape == x1.shape
+
+
+def test_usm_ndarray_from_usm_ndarray_readonly():
+    get_queue_or_skip()
+
+    x1 = dpt.arange(10, dtype="f4")
+    x1.flags["W"] = False
+    x2 = dpt.usm_ndarray(x1.shape, dtype="f4", buffer=x1)
+    assert not x2.flags.writable
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes
+    + [
+        b"float32",
+        dpt.dtype("d"),
+        np.half,
+    ],
+)
+def test_dtypes(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    Xusm = dpt.usm_ndarray((1,), dtype=dtype)
+    assert Xusm.itemsize == dpt.dtype(dtype).itemsize
+    expected_fmt = (dpt.dtype(dtype).str)[1:]
+    actual_fmt = Xusm.__sycl_usm_array_interface__["typestr"][1:]
+    assert expected_fmt == actual_fmt
+
+
+@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
+@pytest.mark.parametrize("buffer_ctor_kwargs", [dict(), {"queue": None}])
+def test_default_dtype(usm_type, buffer_ctor_kwargs):
+    q = get_queue_or_skip()
+    dev = q.get_sycl_device()
+    if buffer_ctor_kwargs:
+        buffer_ctor_kwargs["queue"] = q
+    Xusm = dpt.usm_ndarray(
+        (1,), buffer=usm_type, buffer_ctor_kwargs=buffer_ctor_kwargs
+    )
+    if dev.has_aspect_fp64:
+        expected_dtype = "f8"
+    else:
+        expected_dtype = "f4"
+    assert Xusm.itemsize == dpt.dtype(expected_dtype).itemsize
+    expected_fmt = (dpt.dtype(expected_dtype).str)[1:]
+    actual_fmt = Xusm.__sycl_usm_array_interface__["typestr"][1:]
+    assert expected_fmt == actual_fmt
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "",
+        ">f4",
+        "invalid",
+        123,
+        np.dtype(">f4"),
+        np.dtype([("a", ">f4"), ("b", "i4")]),
+    ],
+)
+def test_dtypes_invalid(dtype):
+    with pytest.raises((TypeError, ValueError)):
+        dpt.usm_ndarray((1,), dtype=dtype)
+
+
+@pytest.mark.parametrize("dt", ["f", "c8"])
+def test_properties(dt):
+    """
+    Test that properties execute
+    """
+    try:
+        X = dpt.usm_ndarray((3, 4, 5), dtype=dt)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert isinstance(X.sycl_queue, dpctl.SyclQueue)
+    assert isinstance(X.sycl_device, dpctl.SyclDevice)
+    assert isinstance(X.sycl_context, dpctl.SyclContext)
+    assert isinstance(X.dtype, dpt.dtype)
+    assert isinstance(X.__sycl_usm_array_interface__, dict)
+    assert isinstance(X.mT, dpt.usm_ndarray)
+    assert isinstance(X.imag, dpt.usm_ndarray)
+    assert isinstance(X.real, dpt.usm_ndarray)
+    assert isinstance(X.shape, tuple)
+    assert isinstance(X.strides, tuple)
+    assert X.usm_type in ("shared", "device", "host")
+    assert isinstance(X.size, numbers.Integral)
+    assert isinstance(X.nbytes, numbers.Integral)
+    assert isinstance(X.ndim, numbers.Integral)
+    assert isinstance(X._pointer, numbers.Integral)
+    assert isinstance(X.device, Device)
+    with pytest.raises(ValueError):
+        # array-API mandates exception for .ndim != 2
+        X.T
+    Y = dpt.usm_ndarray((2, 3), dtype=dt)
+    assert isinstance(Y.mT, dpt.usm_ndarray)
+    V = dpt.usm_ndarray((3,), dtype=dt)
+    with pytest.raises(ValueError):
+        # array-API mandates exception for .ndim != 2
+        V.mT
+
+
+@pytest.mark.parametrize("shape", [tuple(), (1,), (1, 1), (1, 1, 1)])
+@pytest.mark.parametrize("dtype", ["|b1", "|u2", "|f4", "|i8"])
+class TestCopyScalar:
+    @pytest.mark.parametrize("func", [bool, float, int, complex])
+    def test_copy_scalar_with_func(self, func, shape, dtype):
+        try:
+            X = dpt.usm_ndarray(shape, dtype=dtype)
+        except dpctl.SyclDeviceCreationError:
+            pytest.skip("No SYCL devices available")
+        Y = np.arange(1, X.size + 1, dtype=dtype)
+        X.usm_data.copy_from_host(Y.view("|u1"))
+        Y = Y.reshape(())
+        # Non-0D numeric arrays must not be convertible to Python scalars
+        if len(shape) != 0:
+            assert_raises_regex(TypeError, "only 0-dimensional arrays", func, X)
+        else:
+            # 0D arrays are allowed to convert
+            assert func(X) == func(Y)
+
+    @pytest.mark.parametrize(
+        "method", ["__bool__", "__float__", "__int__", "__complex__"]
+    )
+    def test_copy_scalar_with_method(self, method, shape, dtype):
+        try:
+            X = dpt.usm_ndarray(shape, dtype=dtype)
+        except dpctl.SyclDeviceCreationError:
+            pytest.skip("No SYCL devices available")
+        Y = np.arange(1, X.size + 1, dtype=dtype)
+        X.usm_data.copy_from_host(Y.view("|u1"))
+        Y = Y.reshape(())
+        if len(shape) != 0:
+            assert_raises_regex(
+                TypeError, "only 0-dimensional arrays", getattr(X, method)
+            )
+        else:
+            assert getattr(X, method)() == getattr(Y, method)()
+
+
+@pytest.mark.parametrize("func", [bool, float, int, complex])
+@pytest.mark.parametrize("shape", [(2,), (1, 2), (3, 4, 5), (0,)])
+def test_copy_scalar_invalid_shape(func, shape):
+    try:
+        X = dpt.usm_ndarray(shape, dtype="i8")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(ValueError):
+        func(X)
+
+
+def test_index_noninteger():
+    import operator
+
+    try:
+        X = dpt.usm_ndarray(1, "f4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(IndexError):
+        operator.index(X)
+
+
+@pytest.mark.parametrize(
+    "ind",
+    [
+        tuple(),
+        (None,),
+        (
+            None,
+            Ellipsis,
+            None,
+        ),
+        (2, 2, None, 3, 4),
+        (Ellipsis,),
+        (None, slice(0, None, 2), Ellipsis, slice(0, None, 3)),
+        (None, slice(1, None, 2), Ellipsis, slice(1, None, 3)),
+        (None, slice(None, -1, -2), Ellipsis, slice(2, None, 3)),
+        (
+            slice(None, None, -1),
+            slice(None, None, -1),
+            slice(0, None, 3),
+            slice(1, None, 2),
+        ),
+    ],
+)
+def test_basic_slice(ind):
+    try:
+        X = dpt.usm_ndarray((2 * 3, 2 * 4, 3 * 5, 2 * 7), dtype="u1")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    Xnp = np.empty(X.shape, dtype=X.dtype)
+    S = X[ind]
+    Snp = Xnp[ind]
+    assert S.shape == Snp.shape
+    assert S.strides == Snp.strides
+    assert S.dtype == X.dtype
+
+
+def test_empty_slice():
+    # see gh801
+    try:
+        X = dpt.empty((1, 0, 1), dtype="u1")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    Y = X[:, ::-1, :]
+    assert Y.shape == X.shape
+    Z = X[:, ::2, :]
+    assert Z.shape == X.shape
+    X = dpt.empty(0)
+    Y = X[::-1]
+    assert Y.shape == X.shape
+    Z = X[::2]
+    assert Z.shape == X.shape
+    X = dpt.empty((0, 4), dtype="u1")
+    assert X[:, 1].shape == (0,)
+    assert X[:, 1:3].shape == (0, 2)
+
+
+def test_slice_constructor_1d():
+    Xh = np.arange(37, dtype="i4")
+    try:
+        Xusm = dpt.arange(Xh.size, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    for ind in [
+        slice(1, None, 2),
+        slice(0, None, 3),
+        slice(1, None, 3),
+        slice(2, None, 3),
+        slice(None, None, -1),
+        slice(-2, 2, -2),
+        slice(-1, 1, -2),
+        slice(None, None, -13),
+    ]:
+        assert np.array_equal(
+            dpt.asnumpy(Xusm[ind]), Xh[ind]
+        ), "Failed for {}".format(ind)
+
+
+def test_slice_constructor_3d():
+    Xh = np.ones((37, 24, 35), dtype="i4")
+    try:
+        Xusm = dpt.ones(Xh.shape, dtype=Xh.dtype)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    for ind in [
+        slice(1, None, 2),
+        slice(0, None, 3),
+        slice(1, None, 3),
+        slice(2, None, 3),
+        slice(None, None, -1),
+        slice(-2, 2, -2),
+        slice(-1, 1, -2),
+        slice(None, None, -13),
+        (slice(None, None, -2), Ellipsis, None, 15),
+    ]:
+        assert np.array_equal(
+            dpt.to_numpy(Xusm[ind]), Xh[ind]
+        ), "Failed for {}".format(ind)
+
+
+@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
+def test_slice_suai(usm_type):
+    Xh = np.arange(0, 10, dtype="u1")
+    try:
+        Xusm = dpt.arange(0, 10, dtype="u1", usm_type=usm_type)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    for ind in [slice(2, 3, None), slice(5, 7, None), slice(3, 9, None)]:
+        assert np.array_equal(
+            dpm.as_usm_memory(Xusm[ind]).copy_to_host(), Xh[ind]
+        ), "Failed for {}".format(ind)
+
+
+def test_slicing_basic():
+    try:
+        Xusm = dpt.usm_ndarray((10, 5), dtype="c8")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    Xusm[None]
+    Xusm[...]
+    Xusm[8]
+    Xusm[-3]
+    with pytest.raises(IndexError):
+        Xusm[..., ...]
+    with pytest.raises(IndexError):
+        Xusm[1, 1, :, 1]
+    Xusm[:, -4]
+    with pytest.raises(IndexError):
+        Xusm[:, -128]
+    with pytest.raises(IndexError):
+        Xusm[{1, 2, 3, 4, 5, 6, 7}]
+    X = dpt.usm_ndarray(10, "u1")
+    X.usm_data.copy_from_host(b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09")
+    int(
+        X[X[2]]
+    )  # check that objects with __index__ method can be used as indices
+    Xh = dpm.as_usm_memory(X[X[2] : X[5]]).copy_to_host()
+    Xnp = np.arange(0, 10, dtype="u1")
+    assert np.array_equal(Xh, Xnp[Xnp[2] : Xnp[5]])
+
+
+def test_slicing_empty():
+    try:
+        X = dpt.usm_ndarray((0, 10), dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    x = dpt.moveaxis(X, 1, 0)
+    # this used to raise ValueError
+    y = x[1]
+    assert y.ndim == 1
+    assert y.shape == (0,)
+    assert y.dtype == X.dtype
+    assert y.usm_type == X.usm_type
+    assert y.sycl_queue == X.sycl_queue
+    w = x[1:3]
+    assert w.ndim == 2
+    assert w.shape == (
+        2,
+        0,
+    )
+    assert w.dtype == X.dtype
+    assert w.usm_type == X.usm_type
+    assert w.sycl_queue == X.sycl_queue
+
+
+def test_ctor_invalid_shape():
+    with pytest.raises(TypeError):
+        dpt.usm_ndarray(dict())
+
+
+def test_ctor_invalid_order():
+    get_queue_or_skip()
+    with pytest.raises(ValueError):
+        dpt.usm_ndarray((5, 5, 3), order="Z")
+    with pytest.raises(ValueError):
+        dpt.usm_ndarray((10), strides=(1,), order="Z")
+    with pytest.raises(ValueError):
+        dpt.usm_ndarray((), order="Z")
+
+
+def test_ctor_buffer_kwarg():
+    try:
+        dpt.usm_ndarray(10, dtype="i8", buffer=b"device")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(ValueError):
+        dpt.usm_ndarray(10, buffer="invalid_param")
+    Xusm = dpt.usm_ndarray((10, 5), dtype="c8")
+    Xusm[...] = 1
+    X2 = dpt.usm_ndarray(Xusm.shape, buffer=Xusm, dtype=Xusm.dtype)
+    Horig_copy = Xusm.usm_data.copy_to_host()
+    H2_copy = X2.usm_data.copy_to_host()
+    assert np.array_equal(Horig_copy, H2_copy)
+    with pytest.raises(ValueError):
+        dpt.usm_ndarray(10, dtype="i4", buffer=dict())
+    # use device-specific default fp data type
+    X3 = dpt.usm_ndarray(Xusm.shape, buffer=Xusm)
+    assert np.array_equal(Horig_copy, X3.usm_data.copy_to_host())
+
+
+def test_usm_ndarray_props():
+    try:
+        Xusm = dpt.usm_ndarray((10, 5), dtype="c8", order="F")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    Xusm.ndim
+    repr(Xusm)
+    Xusm.flags
+    Xusm.__sycl_usm_array_interface__
+    Xusm.device
+    Xusm.strides
+    Xusm.real
+    Xusm.imag
+    try:
+        dpctl.SyclQueue("cpu")
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Sycl device CPU was not detected")
+    Xusm.to_device("cpu")
+
+
+def test_datapi_device():
+    try:
+        X = dpt.usm_ndarray(1, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    dev_t = type(X.device)
+    with pytest.raises(TypeError):
+        dev_t()
+    dev_t.create_device(X.device)
+    dev_t.create_device(X.sycl_queue)
+    d1 = dev_t.create_device(X.sycl_device)
+    d2 = dev_t.create_device(X.sycl_device.filter_string)
+    d3 = dev_t.create_device(None)
+    assert d1.sycl_queue == d2.sycl_queue
+    assert d1.sycl_queue == d3.sycl_queue
+    X.device.sycl_context
+    X.device.sycl_queue
+    X.device.sycl_device
+    repr(X.device)
+    X.device.print_device_info()
+
+
+def _pyx_capi_int(X, pyx_capi_name, caps_name=b"int", val_restype=ctypes.c_int):
+    import sys
+
+    mod = sys.modules[X.__class__.__module__]
+    cap = mod.__pyx_capi__.get(pyx_capi_name, None)
+    if cap is None:
+        raise ValueError(
+            "__pyx_capi__ does not export {} capsule".format(pyx_capi_name)
+        )
+    # construct Python callable to invoke these functions
+    cap_ptr_fn = ctypes.pythonapi.PyCapsule_GetPointer
+    cap_ptr_fn.restype = ctypes.c_void_p
+    cap_ptr_fn.argtypes = [ctypes.py_object, ctypes.c_char_p]
+    cap_ptr = cap_ptr_fn(cap, caps_name)
+    val_ptr = ctypes.cast(cap_ptr, ctypes.POINTER(val_restype))
+    return val_ptr.contents.value
+
+
+def test_pyx_capi_check_constants():
+    try:
+        X = dpt.usm_ndarray(17, dtype="i1")[1::2]
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    cc_flag = _pyx_capi_int(X, "USM_ARRAY_C_CONTIGUOUS")
+    assert cc_flag > 0 and 0 == (cc_flag & (cc_flag - 1))
+    fc_flag = _pyx_capi_int(X, "USM_ARRAY_F_CONTIGUOUS")
+    assert fc_flag > 0 and 0 == (fc_flag & (fc_flag - 1))
+    w_flag = _pyx_capi_int(X, "USM_ARRAY_WRITABLE")
+    assert w_flag > 0 and 0 == (w_flag & (w_flag - 1))
+
+    bool_typenum = _pyx_capi_int(X, "UAR_BOOL")
+    assert bool_typenum == dpt.dtype("bool_").num
+
+    byte_typenum = _pyx_capi_int(X, "UAR_BYTE")
+    assert byte_typenum == dpt.dtype(np.byte).num
+    ubyte_typenum = _pyx_capi_int(X, "UAR_UBYTE")
+    assert ubyte_typenum == dpt.dtype(np.ubyte).num
+
+    short_typenum = _pyx_capi_int(X, "UAR_SHORT")
+    assert short_typenum == dpt.dtype(np.short).num
+    ushort_typenum = _pyx_capi_int(X, "UAR_USHORT")
+    assert ushort_typenum == dpt.dtype(np.ushort).num
+
+    int_typenum = _pyx_capi_int(X, "UAR_INT")
+    assert int_typenum == dpt.dtype(np.intc).num
+    uint_typenum = _pyx_capi_int(X, "UAR_UINT")
+    assert uint_typenum == dpt.dtype(np.uintc).num
+
+    long_typenum = _pyx_capi_int(X, "UAR_LONG")
+    assert long_typenum == dpt.dtype("l").num
+    ulong_typenum = _pyx_capi_int(X, "UAR_ULONG")
+    assert ulong_typenum == dpt.dtype("L").num
+
+    longlong_typenum = _pyx_capi_int(X, "UAR_LONGLONG")
+    assert longlong_typenum == dpt.dtype(np.longlong).num
+    ulonglong_typenum = _pyx_capi_int(X, "UAR_ULONGLONG")
+    assert ulonglong_typenum == dpt.dtype(np.ulonglong).num
+
+    half_typenum = _pyx_capi_int(X, "UAR_HALF")
+    assert half_typenum == dpt.dtype(np.half).num
+    float_typenum = _pyx_capi_int(X, "UAR_FLOAT")
+    assert float_typenum == dpt.dtype(np.single).num
+    double_typenum = _pyx_capi_int(X, "UAR_DOUBLE")
+    assert double_typenum == dpt.dtype(np.double).num
+
+    cfloat_typenum = _pyx_capi_int(X, "UAR_CFLOAT")
+    assert cfloat_typenum == dpt.dtype(np.csingle).num
+    cdouble_typenum = _pyx_capi_int(X, "UAR_CDOUBLE")
+    assert cdouble_typenum == dpt.dtype(np.cdouble).num
+
+
+@pytest.mark.parametrize(
+    "shape", [tuple(), (1,), (5,), (2, 3), (2, 3, 4), (2, 2, 2, 2, 2)]
+)
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
+def test_tofrom_numpy(shape, dtype, usm_type):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    Xusm = dpt.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=q)
+    Ynp = np.ones(shape, dtype=dtype)
+    Ynp[(0,) * len(shape)] = 0
+    ind = (slice(None, None, None),) * Ynp.ndim
+    Xusm[ind] = Ynp
+    assert np.array_equal(dpt.to_numpy(Xusm), Ynp)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
+def test_tofrom_numpy_permuted(dtype, usm_type):
+    shape = (3, 5, 7)
+    perm = (1, 2, 0)
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    Xusm = dpt.permute_dims(
+        dpt.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=q), perm
+    )
+    Ynp = np.transpose(np.ones(shape, dtype=dtype), perm)
+    Ynp[:, ::2, ::2] = 0
+    ind = (slice(None, None, None),) * Ynp.ndim
+    # even though Xusm and Ynp are strided, simple memcpy could be done.
+    # This test validates that it is being done correctly
+    Xusm[ind] = Ynp
+    assert np.array_equal(dpt.to_numpy(Xusm), Ynp)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+@pytest.mark.parametrize("src_usm_type", ["device", "shared", "host"])
+@pytest.mark.parametrize("dst_usm_type", ["device", "shared", "host"])
+def test_setitem_same_dtype(dtype, src_usm_type, dst_usm_type):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    shape = (2, 4, 3)
+    Xnp = (
+        np.random.randint(-10, 10, size=prod(shape))
+        .astype(dtype)
+        .reshape(shape)
+    )
+    X = dpt.from_numpy(Xnp, usm_type=src_usm_type)
+    Z = dpt.zeros(shape, dtype=dtype, usm_type=dst_usm_type)
+    Zusm_0d = dpt.copy(Z[0, 0, 0])
+    ind = (-1, -1, -1)
+    Xusm_0d = X[ind]
+    Zusm_0d[Ellipsis] = Xusm_0d
+    assert np.array_equal(dpt.to_numpy(Zusm_0d), Xnp[ind])
+    Zusm_1d = dpt.copy(Z[0, 1:3, 0])
+    ind = (-1, slice(0, 2, None), -1)
+    Xusm_1d = X[ind]
+    Zusm_1d[Ellipsis] = Xusm_1d
+    assert np.array_equal(dpt.to_numpy(Zusm_1d), Xnp[ind])
+    Zusm_2d = dpt.copy(Z[:, 1:3, 0])[::-1]
+    Xusm_2d = X[:, 1:4, -1]
+    Zusm_2d[:] = Xusm_2d[:, 0:2]
+    assert np.array_equal(dpt.to_numpy(Zusm_2d), Xnp[:, 1:3, -1])
+    Zusm_3d = dpt.copy(Z)
+    Xusm_3d = X
+    Zusm_3d[:] = Xusm_3d
+    assert np.array_equal(dpt.to_numpy(Zusm_3d), Xnp)
+    Zusm_3d[::-1] = Xusm_3d[::-1]
+    assert np.array_equal(dpt.to_numpy(Zusm_3d), Xnp)
+    Zusm_3d[:] = Xusm_3d[0]
+    R1 = dpt.to_numpy(Zusm_3d)
+    R2 = np.broadcast_to(Xnp[0], R1.shape)
+    assert R1.shape == R2.shape
+    assert np.allclose(R1, R2)
+    Zusm_empty = Zusm_1d[0:0]
+    Zusm_empty[Ellipsis] = Zusm_3d[0, 0, 0:0]
+
+
+def test_setitem_broadcasting():
+    "See gh-1503"
+    get_queue_or_skip()
+    dst = dpt.ones((2, 3, 4), dtype="u4")
+    src = dpt.zeros((3, 1), dtype=dst.dtype)
+    dst[...] = src
+    expected = np.zeros(dst.shape, dtype=dst.dtype)
+    assert np.array_equal(dpt.asnumpy(dst), expected)
+
+
+def test_setitem_broadcasting_offset():
+    get_queue_or_skip()
+    dt = dpt.int32
+    x = dpt.asarray([[1, 2, 3], [6, 7, 8]], dtype=dt)
+    y = dpt.asarray([4, 5], dtype=dt)
+    x[0] = y[1]
+    expected = dpt.asarray([[5, 5, 5], [6, 7, 8]], dtype=dt)
+    assert dpt.all(x == expected)
+
+
+def test_setitem_broadcasting_empty_dst_validation():
+    "Broadcasting rules apply, except exception"
+    get_queue_or_skip()
+    dst = dpt.ones((2, 0, 5, 4), dtype="i8")
+    src = dpt.ones((2, 0, 3, 4), dtype="i8")
+    with pytest.raises(ValueError):
+        dst[...] = src
+
+
+def test_setitem_broadcasting_empty_dst_edge_case():
+    """RHS is shunken to empty array by
+    broadasting rule, hence no exception"""
+    get_queue_or_skip()
+    dst = dpt.ones(1, dtype="i8")[0:0]
+    src = dpt.ones(tuple(), dtype="i8")
+    dst[...] = src
+
+
+def test_setitem_broadcasting_src_ndim_equal_dst_ndim():
+    get_queue_or_skip()
+    dst = dpt.ones((2, 3, 4), dtype="i4")
+    src = dpt.zeros((2, 1, 4), dtype="i4")
+    dst[...] = src
+
+    expected = np.zeros(dst.shape, dtype=dst.dtype)
+    assert np.array_equal(dpt.asnumpy(dst), expected)
+
+
+def test_setitem_broadcasting_src_ndim_greater_than_dst_ndim():
+    get_queue_or_skip()
+    dst = dpt.ones((2, 3, 4), dtype="i4")
+    src = dpt.zeros((1, 2, 1, 4), dtype="i4")
+    dst[...] = src
+
+    expected = np.zeros(dst.shape, dtype=dst.dtype)
+    assert np.array_equal(dpt.asnumpy(dst), expected)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
+def test_setitem_scalar(dtype, usm_type):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.usm_ndarray((6, 6), dtype=dtype, buffer=usm_type)
+    for i in range(X.size):
+        X[np.unravel_index(i, X.shape)] = np.asarray(i, dtype=dtype)
+    assert np.array_equal(
+        dpt.to_numpy(X), np.arange(X.size).astype(dtype).reshape(X.shape)
+    )
+    Y = dpt.usm_ndarray((2, 3), dtype=dtype, buffer=usm_type)
+    for i in range(Y.size):
+        Y[np.unravel_index(i, Y.shape)] = i
+    assert np.array_equal(
+        dpt.to_numpy(Y), np.arange(Y.size).astype(dtype).reshape(Y.shape)
+    )
+
+
+def test_setitem_errors():
+    q = get_queue_or_skip()
+    X = dpt.empty((4,), dtype="u1", sycl_queue=q)
+    Y = dpt.empty((4, 2), dtype="u1", sycl_queue=q)
+    with pytest.raises(ValueError):
+        X[:] = Y
+    with pytest.raises(ValueError):
+        X[:] = Y[:, 0:1]
+    X[:] = Y[None, :, 0]
+
+
+@pytest.mark.parametrize("src_dt,dst_dt", [("i4", "i8"), ("f4", "f8")])
+def test_setitem_different_dtypes(src_dt, dst_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dst_dt, q)
+    X = dpt.ones(10, dtype=src_dt, sycl_queue=q)
+    Y = dpt.zeros(10, dtype=src_dt, sycl_queue=q)
+    Z = dpt.empty((20,), dtype=dst_dt, sycl_queue=q)
+    Z[::2] = X
+    Z[1::2] = Y
+    assert np.allclose(dpt.asnumpy(Z), np.tile(np.array([1, 0], Z.dtype), 10))
+
+
+def test_setitem_wingaps():
+    q = get_queue_or_skip()
+    if dpt.dtype("intc").itemsize == dpt.dtype("int32").itemsize:
+        dpt_dst = dpt.empty(4, dtype="int32", sycl_queue=q)
+        np_src = np.arange(4, dtype="intc")
+        dpt_dst[:] = np_src  # should not raise exceptions
+        assert np.array_equal(dpt.asnumpy(dpt_dst), np_src)
+    if dpt.dtype("long").itemsize == dpt.dtype("longlong").itemsize:
+        dpt_dst = dpt.empty(4, dtype="longlong", sycl_queue=q)
+        np_src = np.arange(4, dtype="long")
+        dpt_dst[:] = np_src  # should not raise exceptions
+        assert np.array_equal(dpt.asnumpy(dpt_dst), np_src)
+
+
+def test_shape_setter():
+    def cc_strides(sh):
+        return np.empty(sh, dtype="u1").strides
+
+    def relaxed_strides_equal(st1, st2, sh):
+        eq_ = True
+        for s1, s2, d in zip(st1, st2, sh):
+            eq_ = eq_ and ((d == 1) or (s1 == s2))
+        return eq_
+
+    sh_s = (2 * 3 * 4 * 5,)
+    sh_f = (
+        2,
+        3,
+        4,
+        5,
+    )
+    try:
+        X = dpt.usm_ndarray(sh_s, dtype="i8")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    X.shape = sh_f
+    assert X.shape == sh_f
+    assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f)
+    assert X.flags.c_contiguous, "reshaped array expected to be C-contiguous"
+
+    sh_s = (
+        2,
+        12,
+        5,
+    )
+    sh_f = (
+        2,
+        3,
+        4,
+        5,
+    )
+    X = dpt.usm_ndarray(sh_s, dtype="u4", order="C")
+    X.shape = sh_f
+    assert X.shape == sh_f
+    assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f)
+
+    sh_s = (2, 3, 4, 5)
+    sh_f = (4, 3, 2, 5)
+    X = dpt.usm_ndarray(sh_s, dtype="f4")
+    X.shape = sh_f
+    assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f)
+
+    sh_s = (2, 3, 4, 5)
+    sh_f = (4, 3, 1, 2, 5)
+    X = dpt.usm_ndarray(sh_s, dtype="?")
+    X.shape = sh_f
+    assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f)
+    sz = X.size
+    X.shape = sz
+    assert X.shape == (sz,)
+    assert relaxed_strides_equal(X.strides, (1,), (sz,))
+
+    X = dpt.usm_ndarray(sh_s, dtype="u4")
+    with pytest.raises(TypeError):
+        X.shape = "abcbe"
+    X = dpt.usm_ndarray((4, 4), dtype="u1")[::2, ::2]
+    with pytest.raises(AttributeError):
+        X.shape = (4,)
+    X = dpt.usm_ndarray((0,), dtype="i4")
+    X.shape = (0,)
+    X.shape = (
+        2,
+        0,
+    )
+    X.shape = (
+        0,
+        2,
+    )
+    X.shape = (
+        1,
+        0,
+        1,
+    )
+
+
+def test_len():
+    try:
+        X = dpt.usm_ndarray(1, "i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert len(X) == 1
+    X = dpt.usm_ndarray((2, 1), "i4")
+    assert len(X) == 2
+    X = dpt.usm_ndarray(tuple(), "i4")
+    with pytest.raises(TypeError):
+        len(X)
+
+
+def test_array_namespace():
+    try:
+        X = dpt.usm_ndarray(1, "i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    X.__array_namespace__()
+    X._set_namespace(dpt)
+    assert X.__array_namespace__() is dpt
+    X.__array_namespace__(api_version=dpt.__array_api_version__)
+    assert X.__array_namespace__() is dpt
+
+
+def test_dlpack():
+    try:
+        X = dpt.usm_ndarray(1, "i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    X.__dlpack_device__()
+    X.__dlpack__(stream=None)
+
+
+def test_to_device():
+    try:
+        X = dpt.usm_ndarray(1, "f4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    for dev in dpctl.get_devices():
+        if dev.default_selector_score > 0:
+            Y = X.to_device(dev)
+            assert Y.sycl_device == dev
+
+
+def test_to_device_stream_validation():
+    try:
+        X = dpt.usm_ndarray(1, "f4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    # invalid type of stream keyword
+    with pytest.raises(TypeError):
+        X.to_device(X.sycl_queue, stream=dict())
+    # stream is keyword-only arg
+    with pytest.raises(TypeError):
+        X.to_device(X.sycl_queue, X.sycl_queue)
+
+
+def test_to_device_stream_use():
+    try:
+        X = dpt.usm_ndarray(1, "f4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    q1 = dpctl.SyclQueue(
+        X.sycl_context, X.sycl_device, property="enable_profiling"
+    )
+    X.to_device(q1, stream=q1)
+
+
+def test_to_device_migration():
+    q1 = get_queue_or_skip()  # two distinct copies of default-constructed queue
+    q2 = get_queue_or_skip()
+    X1 = dpt.empty((5,), dtype="i8", sycl_queue=q1)  # X1 is associated with q1
+    X2 = X1.to_device(q2)  # X2 is reassociated with q2
+    assert X1.sycl_queue == q1
+    assert X2.sycl_queue == q2
+    assert X1.usm_data._pointer == X2.usm_data._pointer
+
+
+def test_astype():
+    try:
+        X = dpt.empty((5, 5), dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    X[:] = np.full((5, 5), 7, dtype="i4")
+    Y = dpt.astype(X, "c8", order="C")
+    assert np.allclose(dpt.to_numpy(Y), np.full((5, 5), 7, dtype="c8"))
+    if Y.sycl_device.has_aspect_fp16:
+        Y = dpt.astype(X[::2, ::-1], "f2", order="K")
+        assert np.allclose(dpt.to_numpy(Y), np.full(Y.shape, 7, dtype="f2"))
+    Y = dpt.astype(X[::2, ::-1], "f4", order="K")
+    assert np.allclose(dpt.to_numpy(Y), np.full(Y.shape, 7, dtype="f4"))
+    Y = dpt.astype(X[::2, ::-1], "i4", order="K", copy=False)
+    assert Y.usm_data is X.usm_data
+    Y = dpt.astype(X, None, order="K")
+    if X.sycl_queue.sycl_device.has_aspect_fp64:
+        assert Y.dtype is dpt.float64
+    else:
+        assert Y.dtype is dpt.float32
+
+
+def test_astype_invalid_order():
+    try:
+        X = dpt.usm_ndarray(5, "i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(ValueError):
+        dpt.astype(X, "i4", order="WRONG")
+
+
+def test_astype_device():
+    get_queue_or_skip()
+    q1 = dpctl.SyclQueue()
+    q2 = dpctl.SyclQueue()
+
+    x = dpt.arange(5, dtype="i4", sycl_queue=q1)
+    r = dpt.astype(x, "f4")
+    assert r.sycl_queue == x.sycl_queue
+    assert r.sycl_device == x.sycl_device
+
+    r = dpt.astype(x, "f4", device=q2)
+    assert r.sycl_queue == q2
+
+
+def test_astype_gh_1926():
+    get_queue_or_skip()
+
+    x = dpt.ones(64)
+    x_ = dpt.astype(x, x.dtype, copy=False, order="C")
+    assert x is x_
+
+    x__ = dpt.astype(x, x.dtype, copy=False, order="F")
+    assert x is x__
+
+
+def test_astype_gh_2121():
+    get_queue_or_skip()
+
+    x_np = np.asarray([0, 3, 1, 2, 0, 1], dtype="u1").view("?")
+    x = dpt.asarray(x_np)
+    res = dpt.astype(x, dpt.uint8)
+    expected = dpt.asarray([0, 1, 1, 1, 0, 1], dtype="u1")
+    assert dpt.all(res == expected)
+
+
+def test_copy():
+    try:
+        X = dpt.usm_ndarray((5, 5), "i4")[2:4, 1:4]
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    X[:] = 42
+    Yc = dpt.copy(X, order="C")
+    Yf = dpt.copy(X, order="F")
+    Ya = dpt.copy(X, order="A")
+    Yk = dpt.copy(X, order="K")
+    assert Yc.usm_data is not X.usm_data
+    assert Yf.usm_data is not X.usm_data
+    assert Ya.usm_data is not X.usm_data
+    assert Yk.usm_data is not X.usm_data
+    assert Yc.strides == (3, 1)
+    assert Yf.strides == (1, 2)
+    assert Ya.strides == (3, 1)
+    assert Yk.strides == (3, 1)
+    ref = np.full(X.shape, 42, dtype=X.dtype)
+    assert np.array_equal(dpt.asnumpy(Yc), ref)
+    assert np.array_equal(dpt.asnumpy(Yf), ref)
+    assert np.array_equal(dpt.asnumpy(Ya), ref)
+    assert np.array_equal(dpt.asnumpy(Yk), ref)
+
+
+def test_copy_unaligned():
+    get_queue_or_skip()
+
+    x = dpt.ones(513, dtype="i4")
+    r = dpt.astype(x[1:], "f4")
+
+    assert dpt.all(r == 1)
+
+
+def test_ctor_invalid():
+    try:
+        m = dpm.MemoryUSMShared(12)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(ValueError):
+        dpt.usm_ndarray((4,), dtype="i4", buffer=m)
+    m = dpm.MemoryUSMShared(64)
+    with pytest.raises(ValueError):
+        dpt.usm_ndarray((4,), dtype="u1", buffer=m, strides={"not": "valid"})
+
+
+def test_reshape():
+    try:
+        X = dpt.usm_ndarray((5, 5), "i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    # can be done as views
+    Y = dpt.reshape(X, (25,))
+    assert Y.shape == (25,)
+    Z = X[::2, ::2]
+    # requires a copy
+    W = dpt.reshape(Z, (Z.size,), order="F")
+    assert W.shape == (Z.size,)
+    with pytest.raises(TypeError):
+        dpt.reshape("invalid")
+    with pytest.raises(ValueError):
+        dpt.reshape(Z, (2, 2, 2, 2, 2))
+    with pytest.raises(ValueError):
+        dpt.reshape(Z, Z.shape, order="invalid")
+    W = dpt.reshape(Z, (-1,), order="C")
+    assert W.shape == (Z.size,)
+
+    X = dpt.usm_ndarray((1,), dtype="i8")
+    Y = dpt.reshape(X, X.shape)
+    assert Y.flags == X.flags
+
+    A = dpt.usm_ndarray((0,), "i4")
+    A1 = dpt.reshape(A, (0,))
+    assert A1.shape == (0,)
+    requested_shape = (
+        2,
+        0,
+    )
+    A2 = dpt.reshape(A, requested_shape)
+    assert A2.shape == requested_shape
+    requested_shape = (
+        0,
+        2,
+    )
+    A3 = dpt.reshape(A, requested_shape)
+    assert A3.shape == requested_shape
+    requested_shape = (
+        1,
+        0,
+        2,
+    )
+    A4 = dpt.reshape(A, requested_shape)
+    assert A4.shape == requested_shape
+
+
+def test_reshape_orderF():
+    try:
+        a = dpt.arange(6 * 3 * 4, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    b = dpt.reshape(a, (6, 2, 6))
+    c = dpt.reshape(b, (9, 8), order="F")
+    assert c.flags.f_contiguous
+    assert c._pointer != b._pointer
+    assert b._pointer == a._pointer
+
+    a_np = np.arange(6 * 3 * 4, dtype="i4")
+    b_np = np.reshape(a_np, (6, 2, 6))
+    c_np = np.reshape(b_np, (9, 8), order="F")
+    assert np.array_equal(c_np, dpt.asnumpy(c))
+
+
+def test_reshape_noop():
+    """Per gh-1664"""
+    try:
+        a = dpt.ones((2, 1))
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    b = dpt.reshape(a, (2, 1))
+    assert b is a
+
+
+def test_reshape_zero_size():
+    try:
+        a = dpt.empty((0,))
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(ValueError):
+        dpt.reshape(a, (-1, 0))
+
+
+def test_reshape_large_ndim():
+    ndim = 32
+    idx = tuple(1 if i + 1 < ndim else ndim for i in range(ndim))
+    try:
+        d = dpt.ones(ndim, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    d = dpt.reshape(d, idx)
+    assert d.shape == idx
+
+
+def test_reshape_copy_kwrd():
+    try:
+        X = dpt.usm_ndarray((2, 3), "i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    new_shape = (6,)
+    Z = dpt.reshape(X, new_shape, copy=True)
+    assert Z.shape == new_shape
+    assert Z.usm_data is not X.usm_data
+    X = dpt.usm_ndarray((3, 3), "i4")[::2, ::2]
+    new_shape = (4,)
+    with pytest.raises(ValueError):
+        Z = dpt.reshape(X, new_shape, copy=False)
+    with pytest.raises(ValueError):
+        invalid = Ellipsis
+        Z = dpt.reshape(X, new_shape, copy=invalid)
+
+
+def test_transpose():
+    n, m = 2, 3
+    try:
+        X = dpt.usm_ndarray((n, m), "f4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    Xnp = np.arange(n * m, dtype="f4").reshape((n, m))
+    X[:] = Xnp
+    assert np.array_equal(dpt.to_numpy(X.T), Xnp.T)
+    assert np.array_equal(dpt.to_numpy(X[1:].T), Xnp[1:].T)
+
+
+def test_real_imag_views():
+    n, m = 2, 3
+    try:
+        X = dpt.usm_ndarray((n, m), "c8")
+        X_scalar = dpt.usm_ndarray((), dtype="c8")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    Xnp_r = np.arange(n * m, dtype="f4").reshape((n, m))
+    Xnp_i = np.arange(n * m, 2 * n * m, dtype="f4").reshape((n, m))
+    Xnp = Xnp_r + 1j * Xnp_i
+    X[:] = Xnp
+    X_real = X.real
+    X_imag = X.imag
+    assert np.array_equal(dpt.to_numpy(X_real), Xnp.real)
+    assert np.array_equal(dpt.to_numpy(X.imag), Xnp.imag)
+    assert not X_real.flags["C"] and not X_real.flags["F"]
+    assert not X_imag.flags["C"] and not X_imag.flags["F"]
+    assert X_real.strides == X_imag.strides
+    assert np.array_equal(dpt.to_numpy(X[1:].real), Xnp[1:].real)
+    assert np.array_equal(dpt.to_numpy(X[1:].imag), Xnp[1:].imag)
+
+    X_scalar[...] = complex(n * m, 2 * n * m)
+    assert X_scalar.real and X_scalar.imag
+
+    # check that _zero_like works for scalars
+    X_scalar = dpt.usm_ndarray((), dtype="f4")
+    assert isinstance(X_scalar.imag, dpt.usm_ndarray)
+    assert not X_scalar.imag
+    assert X_scalar.real.sycl_queue == X_scalar.imag.sycl_queue
+
+
+def test_real_imag_views_fp16():
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dpt.float16, q)
+
+    X = dpt.usm_ndarray(
+        (3, 4), dtype=dpt.float16, buffer_ctor_kwargs={"queue": q}
+    )
+    assert isinstance(X.real, dpt.usm_ndarray) and isinstance(
+        X.imag, dpt.usm_ndarray
+    )
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+def test_zeros(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros(10, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(X), np.zeros(10, dtype=dtype))
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+def test_ones(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones(10, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(X), np.ones(10, dtype=dtype))
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+def test_full(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.full(10, 4, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(X), np.full(10, 4, dtype=dtype))
+
+
+def test_full_cmplx128():
+    q = get_queue_or_skip()
+    dtype = "c16"
+    skip_if_dtype_not_supported(dtype, q)
+    fill_v = 1 + 1j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+    fill_v = 0 + 1j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+    fill_v = 0 + 0j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+
+
+def test_full_dtype_inference():
+    try:
+        X = dpt.full(10, 4)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert np.issubdtype(X.dtype, np.integer)
+    try:
+        X = dpt.full(10, True)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert X.dtype is dpt.dtype(np.bool_)
+    assert np.issubdtype(dpt.full(10, 12.3).dtype, np.floating)
+    try:
+        X = dpt.full(10, 0.3 - 2j)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    cdt = X.dtype
+    assert np.issubdtype(cdt, np.complexfloating)
+
+    assert np.issubdtype(dpt.full(10, 12.3, dtype=int).dtype, np.integer)
+    assert np.issubdtype(dpt.full(10, 0.3 - 2j, dtype=int).dtype, np.integer)
+    rdt = np.finfo(cdt).dtype
+    assert np.issubdtype(dpt.full(10, 0.3 - 2j, dtype=rdt).dtype, np.floating)
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_full_special_fp(dt):
+    """See gh-1314"""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    ar = dpt.full(10, fill_value=dpt.nan)
+    err_msg = f"Failed for fill_value=dpt.nan and dtype {dt}"
+    assert dpt.isnan(ar[0]), err_msg
+
+    ar = dpt.full(10, fill_value=dpt.inf)
+    err_msg = f"Failed for fill_value=dpt.inf and dtype {dt}"
+    assert dpt.isinf(ar[0]) and dpt.greater(ar[0], 0), err_msg
+
+    ar = dpt.full(10, fill_value=-dpt.inf)
+    err_msg = f"Failed for fill_value=-dpt.inf and dtype {dt}"
+    assert dpt.isinf(ar[0]) and dpt.less(ar[0], 0), err_msg
+
+    ar = dpt.full(10, fill_value=dpt.pi)
+    err_msg = f"Failed for fill_value=dpt.pi and dtype {dt}"
+    check = abs(float(ar[0]) - dpt.pi) < 16 * dpt.finfo(ar.dtype).eps
+    assert check, err_msg
+
+
+def test_full_fill_array():
+    q = get_queue_or_skip()
+
+    Xnp = np.array([1, 2, 3], dtype="i4")
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    shape = (3, 3)
+    Y = dpt.full(shape, X)
+    Ynp = np.full(shape, Xnp)
+
+    assert Y.dtype == Ynp.dtype
+    assert Y.usm_type == "device"
+    assert np.array_equal(dpt.asnumpy(Y), Ynp)
+
+
+def test_full_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    X = dpt.arange(10, dtype="i4", sycl_queue=q1, usm_type="shared")
+    Y = dpt.full(10, X[3])
+
+    assert Y.dtype == X.dtype
+    assert Y.usm_type == X.usm_type
+    assert dpctl.utils.get_execution_queue((Y.sycl_queue, X.sycl_queue))
+    assert np.array_equal(dpt.asnumpy(Y), np.full(10, 3, dtype="i4"))
+
+    Y = dpt.full(10, X[3], dtype="f4", sycl_queue=q2, usm_type="host")
+
+    assert Y.dtype == dpt.dtype("f4")
+    assert Y.usm_type == "host"
+    assert dpctl.utils.get_execution_queue((Y.sycl_queue, q2))
+    assert np.array_equal(dpt.asnumpy(Y), np.full(10, 3, dtype="f4"))
+
+
+@pytest.mark.parametrize("order1", ["F", "C"])
+@pytest.mark.parametrize("order2", ["F", "C"])
+def test_full_order(order1, order2):
+    q = get_queue_or_skip()
+    Xnp = np.array([1, 2, 3], order=order1)
+    Ynp = np.full((3, 3), Xnp, order=order2)
+    Y = dpt.full((3, 3), Xnp, order=order2, sycl_queue=q)
+    assert Y.flags.c_contiguous == Ynp.flags.c_contiguous
+    assert Y.flags.f_contiguous == Ynp.flags.f_contiguous
+    assert np.array_equal(dpt.asnumpy(Y), Ynp)
+
+
+def test_full_strides():
+    q = get_queue_or_skip()
+    X = dpt.full((3, 3), dpt.arange(3, dtype="i4"), sycl_queue=q)
+    Xnp = np.full((3, 3), np.arange(3, dtype="i4"))
+    assert X.strides == tuple(el // Xnp.itemsize for el in Xnp.strides)
+    assert np.array_equal(dpt.asnumpy(X), Xnp)
+
+    X = dpt.full((3, 3), dpt.arange(6, dtype="i4")[::2], sycl_queue=q)
+    Xnp = np.full((3, 3), np.arange(6, dtype="i4")[::2])
+    assert X.strides == tuple(el // Xnp.itemsize for el in Xnp.strides)
+    assert np.array_equal(dpt.asnumpy(X), Xnp)
+
+
+@pytest.mark.parametrize("dt", ["i1", "u1", "i2", "u2", "i4", "u4", "i8", "u8"])
+def test_full_gh_1230(dt):
+    get_queue_or_skip()
+    dtype = dpt.dtype(dt)
+    dt_maxint = dpt.iinfo(dtype).max
+
+    if (dtype.itemsize < 8) and (np.lib.NumpyVersion(np.__version__) < "2.0.0"):
+        try:
+            X = dpt.full(1, fill_value=(dt_maxint + 1), dtype=dt)
+        except OverflowError:
+            pytest.skip("Expected OverflowError raised")
+        Y = dpt.full_like(X, fill_value=dpt.iinfo(dt).min)
+        assert dpt.all(X == Y)
+    else:
+        with pytest.raises(OverflowError):
+            dpt.full(1, dt_maxint + 1, dtype=dt)
+
+
+@pytest.mark.parametrize(
+    "dt",
+    _all_dtypes[1:],
+)
+def test_arange(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+    X = dpt.arange(0, 123, dtype=dt, sycl_queue=q)
+    dt = dpt.dtype(dt)
+    if np.issubdtype(dt, np.integer):
+        assert int(X[47]) == 47
+    elif np.issubdtype(dt, np.floating):
+        assert float(X[47]) == 47.0
+    elif np.issubdtype(dt, np.complexfloating):
+        assert complex(X[47]) == 47.0 + 0.0j
+
+    # choose size larger than maximal value that u1/u2 can accommodate
+    sz = int(dpt.iinfo(dpt.int8).max)
+    X1 = dpt.arange(sz + 1, dtype=dt, sycl_queue=q)
+    assert X1.shape == (sz + 1,)
+
+    X2 = dpt.arange(sz, 0, -1, dtype=dt, sycl_queue=q)
+    assert X2.shape == (sz,)
+
+
+def test_arange_fp():
+    q = get_queue_or_skip()
+
+    assert dpt.arange(7, 0, -2, dtype="f4", device=q).shape == (4,)
+    assert dpt.arange(0, 1, 0.25, dtype="f4", device=q).shape == (4,)
+
+    has_fp64 = q.sycl_device.has_aspect_fp64
+    if has_fp64:
+        assert dpt.arange(7, 0, -2, dtype="f8", device=q).shape == (4,)
+    assert dpt.arange(0, 1, 0.25, dtype="f4", device=q).shape == (4,)
+
+    x = dpt.arange(9.7, stop=10, sycl_queue=q)
+    assert x.shape == (1,)
+    assert x.dtype == dpt.float64 if has_fp64 else dpt.float32
+
+
+def test_arange_step_None():
+    q = get_queue_or_skip()
+
+    x = dpt.arange(0, stop=10, step=None, dtype="int32", sycl_queue=q)
+    assert x.shape == (10,)
+
+
+def test_arange_bool():
+    q = get_queue_or_skip()
+
+    x = dpt.arange(0, stop=2, dtype="bool", sycl_queue=q)
+    assert x.shape == (2,)
+    assert x.dtype == dpt.bool
+
+
+def test_arange_mixed_types():
+    q = get_queue_or_skip()
+
+    x = dpt.arange(-2.5, stop=200, step=100, dtype="int32", sycl_queue=q)
+    assert x.shape[0] == 3
+    assert int(x[1]) == 99 + int(x[0])
+
+    x = dpt.arange(+2.5, stop=200, step=100, dtype="int32", device=x.device)
+    assert x.shape[0] == 2
+    assert int(x[1]) == 100 + int(x[0])
+
+    _stop = np.float32(504)
+    x = dpt.arange(0, stop=_stop, step=100, dtype="f4", device=x.device)
+    assert x.shape == (6,)
+
+    # ensure length is determined using uncast parameters
+    x = dpt.arange(-5, stop=10**2, step=2.7, dtype="int64", device=x.device)
+    assert x.shape == (39,)
+
+
+@pytest.mark.parametrize(
+    "dt",
+    _all_dtypes,
+)
+def test_linspace(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+    X = dpt.linspace(0, 1, num=2, dtype=dt, sycl_queue=q)
+    assert np.allclose(dpt.asnumpy(X), np.linspace(0, 1, num=2, dtype=dt))
+
+
+def test_linspace_fp():
+    q = get_queue_or_skip()
+    n = 16
+    X = dpt.linspace(0, n - 1, num=n, sycl_queue=q)
+    if q.sycl_device.has_aspect_fp64:
+        assert X.dtype == dpt.dtype("float64")
+    else:
+        assert X.dtype == dpt.dtype("float32")
+    assert X.shape == (n,)
+    assert X.strides == (1,)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_linspace_fp_max(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    n = 16
+    dt = dpt.dtype(dtype)
+    max_ = dpt.finfo(dt).max
+    X = dpt.linspace(max_, max_, endpoint=True, num=n, dtype=dt, sycl_queue=q)
+    assert X.shape == (n,)
+    assert X.strides == (1,)
+    assert np.allclose(
+        dpt.asnumpy(X), np.linspace(max_, max_, endpoint=True, num=n, dtype=dt)
+    )
+
+
+def test_linspace_int():
+    q = get_queue_or_skip()
+    X = dpt.linspace(0.1, 9.1, 11, endpoint=True, dtype=int, sycl_queue=q)
+    Xnp = np.linspace(0.1, 9.1, 11, endpoint=True, dtype=int)
+    assert np.array_equal(dpt.asnumpy(X), Xnp)
+
+
+@pytest.mark.parametrize(
+    "dt",
+    _all_dtypes,
+)
+@pytest.mark.parametrize(
+    "usm_kind",
+    [
+        "shared",
+        "device",
+        "host",
+    ],
+)
+def test_empty_like(dt, usm_kind):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q)
+    Y = dpt.empty_like(X)
+    assert X.shape == Y.shape
+    assert X.dtype == Y.dtype
+    assert X.usm_type == Y.usm_type
+    assert X.sycl_queue == Y.sycl_queue
+
+    X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q)
+    Y = dpt.empty_like(X)
+    assert X.shape == Y.shape
+    assert X.dtype == Y.dtype
+    assert X.usm_type == Y.usm_type
+    assert X.sycl_queue == Y.sycl_queue
+
+
+def test_empty_unexpected_data_type():
+    with pytest.raises(TypeError):
+        try:
+            dpt.empty(1, dtype=np.object_)
+        except dpctl.SyclDeviceCreationError:
+            pytest.skip("No SYCL devices available")
+
+
+@pytest.mark.parametrize(
+    "dt",
+    _all_dtypes,
+)
+@pytest.mark.parametrize(
+    "usm_kind",
+    [
+        "shared",
+        "device",
+        "host",
+    ],
+)
+def test_zeros_like(dt, usm_kind):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q)
+    Y = dpt.zeros_like(X)
+    assert X.shape == Y.shape
+    assert X.dtype == Y.dtype
+    assert X.usm_type == Y.usm_type
+    assert X.sycl_queue == Y.sycl_queue
+    assert np.allclose(dpt.asnumpy(Y), np.zeros(X.shape, dtype=X.dtype))
+
+    X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q)
+    Y = dpt.zeros_like(X)
+    assert X.shape == Y.shape
+    assert X.dtype == Y.dtype
+    assert X.usm_type == Y.usm_type
+    assert X.sycl_queue == Y.sycl_queue
+    assert np.array_equal(dpt.asnumpy(Y), np.zeros(X.shape, dtype=X.dtype))
+
+
+@pytest.mark.parametrize(
+    "dt",
+    _all_dtypes,
+)
+@pytest.mark.parametrize(
+    "usm_kind",
+    [
+        "shared",
+        "device",
+        "host",
+    ],
+)
+def test_ones_like(dt, usm_kind):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q)
+    Y = dpt.ones_like(X)
+    assert X.shape == Y.shape
+    assert X.dtype == Y.dtype
+    assert X.usm_type == Y.usm_type
+    assert X.sycl_queue == Y.sycl_queue
+    assert np.allclose(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype))
+
+    X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q)
+    Y = dpt.ones_like(X)
+    assert X.shape == Y.shape
+    assert X.dtype == Y.dtype
+    assert X.usm_type == Y.usm_type
+    assert X.sycl_queue == Y.sycl_queue
+    assert np.array_equal(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype))
+
+
+@pytest.mark.parametrize(
+    "dt",
+    _all_dtypes,
+)
+@pytest.mark.parametrize(
+    "usm_kind",
+    [
+        "shared",
+        "device",
+        "host",
+    ],
+)
+def test_full_like(dt, usm_kind):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    fill_v = dpt.dtype(dt).type(1)
+    X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q)
+    Y = dpt.full_like(X, fill_v)
+    assert X.shape == Y.shape
+    assert X.dtype == Y.dtype
+    assert X.usm_type == Y.usm_type
+    assert X.sycl_queue == Y.sycl_queue
+    assert np.allclose(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype))
+
+    X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q)
+    Y = dpt.full_like(X, fill_v)
+    assert X.shape == Y.shape
+    assert X.dtype == Y.dtype
+    assert X.usm_type == Y.usm_type
+    assert X.sycl_queue == Y.sycl_queue
+    assert np.array_equal(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype))
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+@pytest.mark.parametrize("usm_kind", ["shared", "device", "host"])
+def test_eye(dtype, usm_kind):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.eye(4, 5, k=1, dtype=dtype, usm_type=usm_kind, sycl_queue=q)
+    Xnp = np.eye(4, 5, k=1, dtype=dtype)
+    assert X.dtype == Xnp.dtype
+    assert np.array_equal(Xnp, dpt.asnumpy(X))
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_tril(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    shape = (2, 3, 4, 5, 5)
+    X = dpt.reshape(dpt.arange(prod(shape), dtype=dtype, sycl_queue=q), shape)
+    Y = dpt.tril(X)
+    Xnp = np.arange(prod(shape), dtype=dtype).reshape(shape)
+    Ynp = np.tril(Xnp)
+    assert Y.dtype == Ynp.dtype
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_triu(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    shape = (4, 5)
+    X = dpt.reshape(dpt.arange(prod(shape), dtype=dtype, sycl_queue=q), shape)
+    Y = dpt.triu(X, k=1)
+    Xnp = np.arange(prod(shape), dtype=dtype).reshape(shape)
+    Ynp = np.triu(Xnp, k=1)
+    assert Y.dtype == Ynp.dtype
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize("tri_fn", [dpt.tril, dpt.triu])
+@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
+def test_tri_usm_type(tri_fn, usm_type):
+    q = get_queue_or_skip()
+    dtype = dpt.uint16
+
+    shape = (2, 3, 4, 5, 5)
+    size = prod(shape)
+    X = dpt.reshape(
+        dpt.arange(size, dtype=dtype, usm_type=usm_type, sycl_queue=q), shape
+    )
+    Y = tri_fn(X)  # main execution branch
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == q
+    Y = tri_fn(X, k=-6)  # special case of Y == X
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == q
+    Y = tri_fn(X, k=6)  # special case of Y == 0
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == q
+
+
+def test_tril_slice():
+    q = get_queue_or_skip()
+
+    shape = (6, 10)
+    X = dpt.reshape(dpt.arange(prod(shape), dtype="int", sycl_queue=q), shape)[
+        1:, ::-2
+    ]
+    Y = dpt.tril(X)
+    Xnp = np.arange(prod(shape), dtype="int").reshape(shape)[1:, ::-2]
+    Ynp = np.tril(Xnp)
+    assert Y.dtype == Ynp.dtype
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_triu_permute_dims():
+    q = get_queue_or_skip()
+
+    shape = (2, 3, 4, 5)
+    X = dpt.permute_dims(
+        dpt.reshape(dpt.arange(prod(shape), dtype="int", sycl_queue=q), shape),
+        (3, 2, 1, 0),
+    )
+    Y = dpt.triu(X)
+    Xnp = np.transpose(
+        np.arange(prod(shape), dtype="int").reshape(shape), (3, 2, 1, 0)
+    )
+    Ynp = np.triu(Xnp)
+    assert Y.dtype == Ynp.dtype
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_tril_broadcast_to():
+    q = get_queue_or_skip()
+
+    shape = (5, 5)
+    X = dpt.broadcast_to(dpt.ones((1), dtype="int", sycl_queue=q), shape)
+    Y = dpt.tril(X)
+    Xnp = np.broadcast_to(np.ones((1), dtype="int"), shape)
+    Ynp = np.tril(Xnp)
+    assert Y.dtype == Ynp.dtype
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_triu_bool():
+    q = get_queue_or_skip()
+
+    shape = (4, 5)
+    X = dpt.ones((shape), dtype="bool", sycl_queue=q)
+    Y = dpt.triu(X)
+    Xnp = np.ones((shape), dtype="bool")
+    Ynp = np.triu(Xnp)
+    assert Y.dtype == Ynp.dtype
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize("order", ["F", "C"])
+@pytest.mark.parametrize("k", [-10, -2, -1, 3, 4, 10])
+def test_triu_order_k(order, k):
+    q = get_queue_or_skip()
+
+    shape = (3, 3)
+    X = dpt.reshape(
+        dpt.arange(prod(shape), dtype="int", sycl_queue=q),
+        shape,
+        order=order,
+    )
+    Y = dpt.triu(X, k=k)
+    Xnp = np.arange(prod(shape), dtype="int").reshape(shape, order=order)
+    Ynp = np.triu(Xnp, k=k)
+    assert Y.dtype == Ynp.dtype
+    assert X.flags == Y.flags
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize("order", ["F", "C"])
+@pytest.mark.parametrize("k", [-10, -4, -3, 1, 2, 10])
+def test_tril_order_k(order, k):
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    shape = (3, 3)
+    X = dpt.reshape(
+        dpt.arange(prod(shape), dtype="int", sycl_queue=q),
+        shape,
+        order=order,
+    )
+    Y = dpt.tril(X, k=k)
+    Xnp = np.arange(prod(shape), dtype="int").reshape(shape, order=order)
+    Ynp = np.tril(Xnp, k=k)
+    assert Y.dtype == Ynp.dtype
+    assert X.flags == Y.flags
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_meshgrid():
+    q = get_queue_or_skip()
+
+    X = dpt.arange(5, sycl_queue=q)
+    Y = dpt.arange(3, sycl_queue=q)
+    Z = dpt.meshgrid(X, Y)
+    Znp = np.meshgrid(dpt.asnumpy(X), dpt.asnumpy(Y))
+    n = len(Z)
+    assert n == len(Znp)
+    for i in range(n):
+        assert np.array_equal(dpt.asnumpy(Z[i]), Znp[i])
+    assert dpt.meshgrid() == []
+    # dimension > 1 must raise ValueError
+    with pytest.raises(ValueError):
+        dpt.meshgrid(dpt.usm_ndarray((4, 4)))
+    # unknown indexing kwarg must raise ValueError
+    with pytest.raises(ValueError):
+        dpt.meshgrid(X, indexing="ji")
+    # input arrays with different data types must raise ValueError
+    with pytest.raises(ValueError):
+        dpt.meshgrid(X, dpt.asarray(Y, dtype="b1"))
+
+
+def test_meshgrid2():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+    q3 = get_queue_or_skip()
+
+    x1 = dpt.arange(0, 2, dtype="int16", sycl_queue=q1)
+    x2 = dpt.arange(3, 6, dtype="int16", sycl_queue=q2)
+    x3 = dpt.arange(6, 10, dtype="int16", sycl_queue=q3)
+    y1, y2, y3 = dpt.meshgrid(x1, x2, x3, indexing="xy")
+    z1, z2, z3 = dpt.meshgrid(x1, x2, x3, indexing="ij")
+    assert all(
+        x.sycl_queue == y.sycl_queue for x, y in zip((x1, x2, x3), (y1, y2, y3))
+    )
+    assert all(
+        x.sycl_queue == z.sycl_queue for x, z in zip((x1, x2, x3), (z1, z2, z3))
+    )
+    assert y1.shape == y2.shape and y2.shape == y3.shape
+    assert z1.shape == z2.shape and z2.shape == z3.shape
+    assert y1.shape == (len(x2), len(x1), len(x3))
+    assert z1.shape == (len(x1), len(x2), len(x3))
+
+
+def test_common_arg_validation():
+    order = "I"
+    # invalid order must raise ValueError
+    with pytest.raises(ValueError):
+        dpt.empty(10, order=order)
+    with pytest.raises(ValueError):
+        dpt.zeros(10, order=order)
+    with pytest.raises(ValueError):
+        dpt.ones(10, order=order)
+    with pytest.raises(ValueError):
+        dpt.full(10, 1, order=order)
+    with pytest.raises(ValueError):
+        dpt.eye(10, order=order)
+    try:
+        X = dpt.empty(10)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(ValueError):
+        dpt.empty_like(X, order=order)
+    with pytest.raises(ValueError):
+        dpt.zeros_like(X, order=order)
+    with pytest.raises(ValueError):
+        dpt.ones_like(X, order=order)
+    with pytest.raises(ValueError):
+        dpt.full_like(X, 1, order=order)
+    X = {}
+    # test for type validation
+    with pytest.raises(TypeError):
+        dpt.empty_like(X)
+    with pytest.raises(TypeError):
+        dpt.zeros_like(X)
+    with pytest.raises(TypeError):
+        dpt.ones_like(X)
+    with pytest.raises(TypeError):
+        dpt.full_like(X, 1)
+    with pytest.raises(TypeError):
+        dpt.tril(X)
+    with pytest.raises(TypeError):
+        dpt.triu(X)
+    with pytest.raises(TypeError):
+        dpt.meshgrid(X)
+
+
+def test_flags():
+    try:
+        x = dpt.empty(tuple(), dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    f = x.flags
+    # check comparison with generic types
+    assert f != Ellipsis
+    f.__repr__()
+    assert f.c_contiguous == f["C"]
+    assert f.f_contiguous == f["F"]
+    assert f.contiguous == f["CONTIGUOUS"]
+    assert f.fc == f["FC"]
+    assert f.forc == f["FORC"]
+    assert f.fnc == f["FNC"]
+    assert f.writable == f["W"]
+
+
+def test_asarray_uint64():
+    Xnp = np.ndarray(1, dtype=np.uint64)
+    try:
+        X = dpt.asarray(Xnp)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert X.dtype == Xnp.dtype
+
+
+def test_Device():
+    try:
+        dev = dpctl.select_default_device()
+        d1 = dpt.Device.create_device(dev)
+        d2 = dpt.Device.create_device(dev)
+    except (dpctl.SyclQueueCreationError, dpctl.SyclDeviceCreationError):
+        pytest.skip(
+            "Could not create default device, or a queue that targets it"
+        )
+    assert d1 == d2
+    dict = {d1: 1}
+    assert dict[d2] == 1
+    assert d1 == d2.sycl_queue
+    assert not d1 == Ellipsis
+
+
+def test_element_offset():
+    n0, n1 = 3, 8
+    try:
+        x = dpt.empty((n0, n1), dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert isinstance(x._element_offset, int)
+    assert x._element_offset == 0
+    y = x[::-1, ::2]
+    assert y._element_offset == (n0 - 1) * n1
+
+
+def test_byte_bounds():
+    n0, n1 = 3, 8
+    try:
+        x = dpt.empty((n0, n1), dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert isinstance(x._byte_bounds, tuple)
+    assert len(x._byte_bounds) == 2
+    lo, hi = x._byte_bounds
+    assert hi - lo == n0 * n1 * x.itemsize
+    y = x[::-1, ::2]
+    lo, hi = y._byte_bounds
+    assert hi - lo == (n0 * n1 - 1) * x.itemsize
+
+
+def test_gh_1201():
+    n = 100
+    a = np.flipud(np.arange(n, dtype="i4"))
+    try:
+        b = dpt.asarray(a)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert (dpt.asnumpy(b) == a).all()
+    c = dpt.flip(dpt.empty(a.shape, dtype=a.dtype))
+    c[:] = a
+    assert (dpt.asnumpy(c) == a).all()
+
+
+class ObjWithSyclUsmArrayInterface:
+    def __init__(self, ary):
+        self._array_obj = ary
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        _suai = self._array_obj.__sycl_usm_array_interface__
+        return _suai
+
+
+@pytest.mark.parametrize("ro_flag", [True, False])
+def test_asarray_writable_flag(ro_flag):
+    try:
+        a = dpt.empty(8)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+
+    a.flags["W"] = not ro_flag
+    wrapped = ObjWithSyclUsmArrayInterface(a)
+
+    b = dpt.asarray(wrapped)
+
+    assert b.flags["W"] == (not ro_flag)
+    assert b._pointer == a._pointer
+
+
+def test_getitem_validation():
+    """Test based on gh-1785"""
+    try:
+        a = dpt.empty((2, 2, 2))
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(IndexError):
+        a[0.0]
+    with pytest.raises(IndexError):
+        a[1, 0.0, ...]
+    with pytest.raises(IndexError):
+        a[1, 0.0, dpt.newaxis, 1]
+    with pytest.raises(IndexError):
+        a[dpt.newaxis, ..., 0.0]
+    with pytest.raises(IndexError):
+        a[dpt.newaxis, ..., 0.0, dpt.newaxis]
+    with pytest.raises(IndexError):
+        a[..., 0.0, dpt.newaxis]
+    with pytest.raises(IndexError):
+        a[:, 0.0, dpt.newaxis]
+
+
+def test_array_like_ctors_order_K():
+    get_queue_or_skip()
+
+    sh = (10, 10)
+    x1 = dpt.zeros(sh, dtype="i4", order="C")
+    r1 = dpt.full_like(x1, 2, order="K")
+    assert dpt.all(r1 == 2)
+    assert r1.flags.c_contiguous
+    r2 = dpt.empty_like(x1, order="K")
+    assert r2.flags.c_contiguous
+    r3 = dpt.ones_like(x1, order="K")
+    assert dpt.all(r3 == 1)
+    assert r3.flags.c_contiguous
+    r4 = dpt.zeros_like(x1, order="K")
+    assert dpt.all(r4 == 0)
+    assert r4.flags.c_contiguous
+
+    x2 = dpt.zeros(sh, dtype="i4", order="F")
+    r5 = dpt.full_like(x2, 2, order="K")
+    assert dpt.all(r5 == 2)
+    assert r5.flags.f_contiguous
+    r6 = dpt.empty_like(x2, order="K")
+    assert r6.flags.f_contiguous
+    r7 = dpt.ones_like(x2, order="K")
+    assert dpt.all(r7 == 1)
+    assert r7.flags.f_contiguous
+    r8 = dpt.zeros_like(x2, order="K")
+    assert dpt.all(r8 == 0)
+    assert r8.flags.f_contiguous
+
+    x3 = dpt.zeros(sh, dtype="i4", order="C")[::-2, :5]
+    st_expected = (-5, 1)
+    r9 = dpt.full_like(x3, 2, order="K")
+    assert dpt.all(r1 == 2)
+    assert r9.strides == st_expected
+    assert not r9.flags.forc
+    r10 = dpt.empty_like(x3, order="K")
+    assert not r10.flags.forc
+    assert r10.strides == st_expected
+    r11 = dpt.ones_like(x3, order="K")
+    assert dpt.all(r11 == 1)
+    assert not r11.flags.forc
+    assert r11.strides == st_expected
+    r12 = dpt.zeros_like(x3, order="K")
+    assert dpt.all(r12 == 0)
+    assert not r12.flags.forc
+    assert r12.strides == st_expected
+
+
+def test_array_like_ctors_order_A():
+    get_queue_or_skip()
+
+    sh = (10, 10)
+    x1 = dpt.zeros(sh, dtype="i4", order="C")
+    r1 = dpt.full_like(x1, 2, order="A")
+    assert dpt.all(r1 == 2)
+    assert r1.flags.c_contiguous
+    r2 = dpt.empty_like(x1, order="A")
+    assert r2.flags.c_contiguous
+    r3 = dpt.ones_like(x1, order="A")
+    assert dpt.all(r3 == 1)
+    assert r3.flags.c_contiguous
+    r4 = dpt.zeros_like(x1, order="A")
+    assert dpt.all(r4 == 0)
+    assert r4.flags.c_contiguous
+
+    x2 = dpt.zeros(sh, dtype="i4", order="F")
+    r5 = dpt.full_like(x2, 2, order="A")
+    assert dpt.all(r5 == 2)
+    assert r5.flags.f_contiguous
+    r6 = dpt.empty_like(x2, order="A")
+    assert r6.flags.f_contiguous
+    r7 = dpt.ones_like(x2, order="A")
+    assert dpt.all(r7 == 1)
+    assert r7.flags.f_contiguous
+    r8 = dpt.zeros_like(x2, order="A")
+    assert dpt.all(r8 == 0)
+    assert r8.flags.f_contiguous
+
+    x3 = dpt.zeros(sh, dtype="i4", order="C")[::-2, :5]
+    r9 = dpt.full_like(x3, 2, order="A")
+    assert dpt.all(r1 == 2)
+    assert r9.flags.c_contiguous
+    r10 = dpt.empty_like(x3, order="A")
+    assert r10.flags.c_contiguous
+    r11 = dpt.ones_like(x3, order="A")
+    assert dpt.all(r11 == 1)
+    assert r11.flags.c_contiguous
+    r12 = dpt.zeros_like(x3, order="A")
+    assert dpt.all(r12 == 0)
+    assert r12.flags.c_contiguous
+
+
+def test_full_like_order_K_array_fill_v():
+    get_queue_or_skip()
+
+    x = dpt.zeros((10, 10), dtype="i4")
+    fill_v = dpt.asarray(2, dtype="i4")
+
+    r1 = dpt.full_like(x, fill_v, order="K")
+    assert dpt.all(r1 == 2)
+
+    # broadcast behavior
+    fill_v = dpt.arange(10, dtype="i4")[:, dpt.newaxis]
+    r1 = dpt.full_like(x, fill_v, order="K")
+    assert dpt.all(r1 == dpt.tile(fill_v, (1, 10)))
+
+
+def test_full_like_order_K_same_input_output_queues():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    x = dpt.zeros((10, 10), dtype="i4", sycl_queue=q1)
+    fill_v = dpt.asarray(2, dtype="i4", sycl_queue=q2)
+
+    r = dpt.full_like(x, fill_v, order="K")
+    assert r.sycl_queue == x.sycl_queue
+
+
+def test_asarray_from_numpy_contig():
+    get_queue_or_skip()
+
+    i_dt = np.int64
+    Xnp = np.arange(32, dtype=i_dt)
+
+    fp_dt = dpt.float32
+    # Use contig copy kernel
+    Xdpt = dpt.asarray(Xnp, dtype=fp_dt)
+
+    assert dpt.all(Xdpt == dpt.arange(32, dtype=fp_dt))
+
+
+def test_setitem_from_numpy_contig():
+    get_queue_or_skip()
+
+    i_dt = np.int64
+    fp_dt = dpt.float32
+
+    Xnp = np.flip(np.arange(32, dtype=i_dt))
+    Xdpt = dpt.flip(dpt.empty(Xnp.shape, dtype=fp_dt))
+    # Use contig copy kernel, after stride simplification
+    Xdpt[:] = Xnp
+
+    expected = dpt.arange(31, stop=-1, step=-1, dtype=fp_dt)
+    assert dpt.all(Xdpt == expected)
+
+    Xnp = np.fliplr(np.reshape(np.arange(-10, 10, dtype=i_dt), (4, 5)))
+    Xdpt = dpt.flip(dpt.empty(Xnp.shape, dtype=fp_dt), axis=-1)
+
+    # after stride simplification, contig kernel is used
+    Xdpt[:] = Xnp
+
+    expected = dpt.reshape(dpt.arange(-10, 10, dtype=fp_dt), (4, 5))
+    assert dpt.all(dpt.flip(Xdpt, axis=-1) == expected)
+
+
+def test_full_functions_raise_type_error():
+    get_queue_or_skip()
+
+    with pytest.raises(TypeError):
+        dpt.full(1, "0")
+
+    x = dpt.ones(1, dtype="i4")
+    with pytest.raises(TypeError):
+        dpt.full_like(x, "0")
+
+
+@pytest.mark.parametrize("dt", _all_dtypes)
+def test_setitem_copy_as_contig_alignment(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    dtype_ = dpt.dtype(dt)
+    n0, n1 = 8, 23
+
+    x = dpt.zeros((n0, n1), dtype=dtype_, sycl_queue=q)
+
+    vals = dpt.ones(n1, dtype=dtype_, sycl_queue=q)[dpt.newaxis, :]
+    x[1:, ...] = vals
+    assert dpt.all(x[0] == 0)
+    assert dpt.all(x[1:, :] == vals)
+
+
+def test_asarray_property():
+    get_queue_or_skip()
+
+    x = dpt.ones(11, dtype="i4")
+
+    with pytest.raises(TypeError):
+        np.asarray(x)
diff --git a/dpnp/tests/tensor/test_usm_ndarray_dlpack.py b/dpnp/tests/tensor/test_usm_ndarray_dlpack.py
new file mode 100644
index 000000000000..4b04339fe7f9
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_dlpack.py
@@ -0,0 +1,917 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import collections
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+import dpnp.tensor._dlpack as _dlp
+import dpnp.tensor._usmarray as dpt_arr
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+device_CPU = dpt_arr.DLDeviceType.kDLCPU
+device_oneAPI = dpt_arr.DLDeviceType.kDLOneAPI
+
+_usm_types_list = ["shared", "device", "host"]
+
+
+@pytest.fixture(params=_usm_types_list)
+def usm_type(request):
+    return request.param
+
+
+_typestrs_list = [
+    "b1",
+    "u1",
+    "i1",
+    "u2",
+    "i2",
+    "u4",
+    "i4",
+    "u8",
+    "i8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.fixture(params=_typestrs_list)
+def typestr(request):
+    return request.param
+
+
+@pytest.fixture
+def all_root_devices():
+    """
+    Caches root devices. For the sake of speed
+    of test suite execution, keep at most two
+    devices from each platform
+    """
+    devs = dpctl.get_devices()
+    devs_per_platform = collections.defaultdict(list)
+    for dev in devs:
+        devs_per_platform[dev.sycl_platform].append(dev)
+
+    pruned = map(lambda li: li[:2], devs_per_platform.values())
+    return sum(pruned, start=[])
+
+
+def test_dlpack_device(usm_type, all_root_devices):
+    for sycl_dev in all_root_devices:
+        X = dpt.empty((64,), dtype="u1", usm_type=usm_type, device=sycl_dev)
+        dev = X.__dlpack_device__()
+        assert type(dev) is tuple
+        assert len(dev) == 2
+        assert dev[0] == device_oneAPI
+        assert dev[1] == sycl_dev.get_device_id()
+
+
+def test_dlpack_exporter(typestr, usm_type, all_root_devices):
+    caps_fn = ctypes.pythonapi.PyCapsule_IsValid
+    caps_fn.restype = bool
+    caps_fn.argtypes = [ctypes.py_object, ctypes.c_char_p]
+    for sycl_dev in all_root_devices:
+        skip_if_dtype_not_supported(typestr, sycl_dev)
+        X = dpt.empty((64,), dtype=typestr, usm_type=usm_type, device=sycl_dev)
+        caps = X.__dlpack__()
+        assert caps_fn(caps, b"dltensor")
+        Y = X[::2]
+        caps2 = Y.__dlpack__()
+        assert caps_fn(caps2, b"dltensor")
+
+
+def test_dlpack_exporter_empty(typestr, usm_type):
+    caps_fn = ctypes.pythonapi.PyCapsule_IsValid
+    caps_fn.restype = bool
+    caps_fn.argtypes = [ctypes.py_object, ctypes.c_char_p]
+    try:
+        sycl_dev = dpctl.select_default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    skip_if_dtype_not_supported(typestr, sycl_dev)
+    X = dpt.empty((0,), dtype=typestr, usm_type=usm_type, device=sycl_dev)
+    caps = X.__dlpack__()
+    assert caps_fn(caps, b"dltensor")
+    Y = dpt.empty(
+        (
+            1,
+            0,
+        ),
+        dtype=typestr,
+        usm_type=usm_type,
+        device=sycl_dev,
+    )
+    caps = Y.__dlpack__()
+    assert caps_fn(caps, b"dltensor")
+
+
+def test_dlpack_exporter_stream():
+    try:
+        q1 = dpctl.SyclQueue()
+        q2 = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Could not create default queues")
+    X = dpt.empty((64,), dtype="u1", sycl_queue=q1)
+    cap1 = X.__dlpack__(stream=q1)
+    cap2 = X.__dlpack__(stream=q2)
+    assert type(cap1) is type(cap2)
+
+
+@pytest.mark.parametrize("shape", [tuple(), (2,), (3, 0, 1), (2, 2, 2)])
+def test_from_dlpack(shape, typestr, usm_type, all_root_devices):
+    for sycl_dev in all_root_devices:
+        skip_if_dtype_not_supported(typestr, sycl_dev)
+        X = dpt.empty(shape, dtype=typestr, usm_type=usm_type, device=sycl_dev)
+        Y = dpt.from_dlpack(X)
+        assert X.shape == Y.shape
+        assert X.dtype == Y.dtype
+        assert X.usm_type == Y.usm_type
+        assert X._pointer == Y._pointer
+        # we can only expect device to round-trip for USM-device and
+        # USM-shared allocations, which are made for specific device
+        assert (Y.usm_type == "host") or (X.sycl_device == Y.sycl_device)
+        if Y.ndim:
+            V = Y[::-1]
+            W = dpt.from_dlpack(V)
+            assert V.strides == W.strides
+
+
+@pytest.mark.parametrize("mod", [2, 5])
+def test_from_dlpack_strides(mod, typestr, usm_type, all_root_devices):
+    for sycl_dev in all_root_devices:
+        skip_if_dtype_not_supported(typestr, sycl_dev)
+        X0 = dpt.empty(
+            3 * mod, dtype=typestr, usm_type=usm_type, device=sycl_dev
+        )
+        for start in range(mod):
+            X = X0[slice(-start - 1, None, -mod)]
+            Y = dpt.from_dlpack(X)
+            assert X.shape == Y.shape
+            assert X.dtype == Y.dtype
+            assert X.usm_type == Y.usm_type
+            assert X._pointer == Y._pointer
+            # we can only expect device to round-trip for USM-device and
+            # USM-shared allocations, which are made for specific device
+            assert (Y.usm_type == "host") or (X.sycl_device == Y.sycl_device)
+            if Y.ndim:
+                V = Y[::-1]
+                W = dpt.from_dlpack(V)
+                assert V.strides == W.strides
+
+
+def test_from_dlpack_input_validation():
+    v = dpt._dlpack.get_build_dlpack_version()
+    assert type(v) is tuple
+    with pytest.raises(TypeError):
+        dpt.from_dlpack(None)
+
+    class DummyWithProperty:
+        @property
+        def __dlpack__(self):
+            return None
+
+    with pytest.raises(TypeError):
+        dpt.from_dlpack(DummyWithProperty())
+
+    class DummyWithMethod:
+        def __dlpack__(self):
+            return None
+
+    with pytest.raises(TypeError):
+        dpt.from_dlpack(DummyWithMethod())
+
+
+def test_from_dlpack_fortran_contig_array_roundtripping():
+    """Based on examples from issue gh-1241"""
+    n0, n1 = 3, 5
+    try:
+        ar1d = dpt.arange(n0 * n1, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+    ar2d_c = dpt.reshape(ar1d, (n0, n1), order="C")
+    ar2d_f = dpt.asarray(ar2d_c, order="F")
+    ar2d_r = dpt.from_dlpack(ar2d_f)
+
+    assert dpt.all(dpt.equal(ar2d_f, ar2d_r))
+    assert dpt.all(dpt.equal(ar2d_c, ar2d_r))
+
+
+def test_dlpack_from_subdevice():
+    """
+    This test checks that array allocated on a sub-device,
+    with memory bound to platform-default SyclContext can be
+    exported and imported via DLPack.
+    """
+    n = 64
+    try:
+        dev = dpctl.SyclDevice()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+    try:
+        sdevs = dev.create_sub_devices(partition="next_partitionable")
+    except dpctl.SyclSubDeviceCreationError:
+        sdevs = None
+    try:
+        if sdevs is None:
+            sdevs = dev.create_sub_devices(partition=[1, 1])
+    except dpctl.SyclSubDeviceCreationError:
+        pytest.skip("Default device can not be partitioned")
+    assert isinstance(sdevs, list) and len(sdevs) > 0
+    try:
+        ctx = sdevs[0].sycl_platform.default_context
+    except dpctl.SyclContextCreationError:
+        pytest.skip("Platform's default_context is not available")
+    try:
+        q = dpctl.SyclQueue(ctx, sdevs[0])
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+
+    ar = dpt.arange(n, dtype=dpt.int32, sycl_queue=q)
+    ar2 = dpt.from_dlpack(ar)
+    assert ar2.sycl_device == sdevs[0]
+
+
+def test_legacy_dlpack_capsule():
+    try:
+        x = dpt.arange(100, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    legacy_ver = (0, 8)
+
+    cap = x.__dlpack__(max_version=legacy_ver)
+    y = _dlp.from_dlpack_capsule(cap)
+    del cap
+    assert x._pointer == y._pointer
+
+    x = dpt.arange(100, dtype="u4")
+    x2 = dpt.reshape(x, (10, 10)).mT
+    cap = x2.__dlpack__(max_version=legacy_ver)
+    y = _dlp.from_dlpack_capsule(cap)
+    del cap
+    assert x2._pointer == y._pointer
+    del x2
+
+    x = dpt.arange(100, dtype="f4")
+    x2 = dpt.asarray(dpt.reshape(x, (10, 10)), order="F")
+    cap = x2.__dlpack__(max_version=legacy_ver)
+    y = _dlp.from_dlpack_capsule(cap)
+    del cap
+    assert x2._pointer == y._pointer
+
+    x = dpt.arange(100, dtype="c8")
+    x3 = x[::-2]
+    cap = x3.__dlpack__(max_version=legacy_ver)
+    y = _dlp.from_dlpack_capsule(cap)
+    assert x3._pointer == y._pointer
+    del x3, y, x
+    del cap
+
+    x = dpt.ones(100, dtype="?")
+    x4 = x[::-2]
+    cap = x4.__dlpack__(max_version=legacy_ver)
+    y = _dlp.from_dlpack_capsule(cap)
+    assert x4._pointer == y._pointer
+    del x4, y, x
+    del cap
+
+
+def test_versioned_dlpack_capsule():
+    try:
+        x = dpt.arange(100, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    max_supported_ver = _dlp.get_build_dlpack_version()
+    cap = x.__dlpack__(max_version=max_supported_ver)
+    y = _dlp.from_dlpack_capsule(cap)
+    del cap
+    assert x._pointer == y._pointer
+
+    x2 = dpt.asarray(dpt.reshape(x, (10, 10)), order="F")
+    cap = x2.__dlpack__(max_version=max_supported_ver)
+    y = _dlp.from_dlpack_capsule(cap)
+    del cap
+    assert x2._pointer == y._pointer
+    del x2
+
+    x3 = x[::-2]
+    cap = x3.__dlpack__(max_version=max_supported_ver)
+    y = _dlp.from_dlpack_capsule(cap)
+    assert x3._pointer == y._pointer
+    del x3, y, x
+    del cap
+
+    # read-only array
+    x = dpt.arange(100, dtype="i4")
+    x.flags["W"] = False
+    cap = x.__dlpack__(max_version=max_supported_ver)
+    y = _dlp.from_dlpack_capsule(cap)
+    assert x._pointer == y._pointer
+    assert not y.flags.writable
+
+    # read-only array, and copy
+    cap = x.__dlpack__(max_version=max_supported_ver, copy=True)
+    y = _dlp.from_dlpack_capsule(cap)
+    assert x._pointer != y._pointer
+    assert not y.flags.writable
+
+
+def test_from_dlpack_kwargs():
+    try:
+        x = dpt.arange(100, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    y = dpt.from_dlpack(x, copy=True)
+    assert x._pointer != y._pointer
+
+    z = dpt.from_dlpack(x, device=x.sycl_device)
+    assert z._pointer == x._pointer
+
+
+def test_dlpack_deleters():
+    try:
+        x = dpt.arange(100, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    legacy_ver = (0, 8)
+    cap = x.__dlpack__(max_version=legacy_ver)
+    del cap
+
+    max_supported_ver = _dlp.get_build_dlpack_version()
+    cap = x.__dlpack__(max_version=max_supported_ver)
+    del cap
+
+
+def test_from_dlpack_device():
+    try:
+        x = dpt.arange(100, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    out = dpt.from_dlpack(x, device=x.__dlpack_device__())
+    assert x.device == out.device
+    assert x._pointer == out._pointer
+
+    out = dpt.from_dlpack(x, device=x.device)
+    assert x.device == out.device
+    assert x._pointer == out._pointer
+
+    out = dpt.from_dlpack(x, device=x.sycl_device)
+    assert x.device == out.device
+    assert x._pointer == out._pointer
+
+
+def test_used_dlpack_capsule():
+    try:
+        x = dpt.arange(100, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    legacy_ver = (0, 8)
+    cap = x.__dlpack__(max_version=legacy_ver)
+    _dlp.from_dlpack_capsule(cap)
+    with pytest.raises(
+        ValueError,
+        match="A DLPack tensor object can not be consumed multiple times",
+    ):
+        _dlp.from_dlpack_capsule(cap)
+    del cap
+
+    max_supported_ver = _dlp.get_build_dlpack_version()
+    cap = x.__dlpack__(max_version=max_supported_ver)
+    _dlp.from_dlpack_capsule(cap)
+    with pytest.raises(
+        ValueError,
+        match="A DLPack tensor object can not be consumed multiple times",
+    ):
+        _dlp.from_dlpack_capsule(cap)
+    del cap
+
+
+def test_dlpack_size_0():
+    try:
+        x = dpt.ones(0, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    legacy_ver = (0, 8)
+    cap = x.__dlpack__(max_version=legacy_ver)
+    y = _dlp.from_dlpack_capsule(cap)
+    assert y._pointer == x._pointer
+
+    max_supported_ver = _dlp.get_build_dlpack_version()
+    cap = x.__dlpack__(max_version=max_supported_ver)
+    y = _dlp.from_dlpack_capsule(cap)
+    assert y._pointer == x._pointer
+
+
+def test_dlpack_max_version_validation():
+    try:
+        x = dpt.ones(100, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    with pytest.raises(
+        TypeError,
+        match=r"`__dlpack__` expects `max_version` to be a "
+        r"2-tuple of integers `\(major, minor\)`, instead "
+        r"got .*",
+    ):
+        x.__dlpack__(max_version=1)
+
+
+def test_dlpack_kwargs():
+    try:
+        q1 = dpctl.SyclQueue()
+        q2 = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Could not create default queues")
+    x = dpt.arange(100, dtype="i4", sycl_queue=q1)
+
+    legacy_ver = (0, 8)
+    cap = x.__dlpack__(stream=q2, max_version=legacy_ver, copy=True)
+    # `copy` ignored for legacy path
+    y = _dlp.from_dlpack_capsule(cap)
+    assert y._pointer == x._pointer
+    del x, y
+    del cap
+
+    x1 = dpt.arange(100, dtype="i4", sycl_queue=q1)
+    max_supported_ver = _dlp.get_build_dlpack_version()
+    cap = x1.__dlpack__(stream=q2, max_version=max_supported_ver, copy=False)
+    y = _dlp.from_dlpack_capsule(cap)
+    assert y._pointer == x1._pointer
+    del x1, y
+    del cap
+
+    x2 = dpt.arange(100, dtype="i4", sycl_queue=q1)
+    cap = x2.__dlpack__(stream=q2, max_version=max_supported_ver, copy=True)
+    y = _dlp.from_dlpack_capsule(cap)
+    assert y._pointer != x2._pointer
+    del x2, y
+    del cap
+
+
+def _is_capsule(o):
+    t = type(o)
+    return t.__module__ == "builtins" and t.__name__ == "PyCapsule"
+
+
+def test_dlpack_dl_device():
+    try:
+        x = dpt.arange(100, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    max_supported_ver = _dlp.get_build_dlpack_version()
+    cap1 = x.__dlpack__(
+        dl_device=x.__dlpack_device__(), max_version=max_supported_ver
+    )
+    assert _is_capsule(cap1)
+    cap2 = x.__dlpack__(dl_device=(1, 0), max_version=max_supported_ver)
+    assert _is_capsule(cap2)
+    cap3 = x.__dlpack__(
+        dl_device=(device_CPU, 0),
+        max_version=max_supported_ver,
+    )
+    assert _is_capsule(cap3)
+    cap4 = x.__dlpack__(dl_device=("kDLCPU", 0), max_version=max_supported_ver)
+    assert _is_capsule(cap4)
+    with pytest.raises(TypeError):
+        # pass method instead of return of its __call__ invocation
+        x.__dlpack__(
+            dl_device=x.__dlpack_device__, max_version=max_supported_ver
+        )
+    with pytest.raises(TypeError):
+        # exercise check for length
+        x.__dlpack__(dl_device=(3,), max_version=max_supported_ver)
+
+
+def test_from_dlpack_kdlcpu_interop_numpy():
+    """
+    Basic test that usm_ndarray can interoperate with NumPy ndarray
+    `__dlpack_device__`.
+    """
+    get_queue_or_skip()
+
+    sh = 5
+    dt = dpt.int32
+
+    X = dpt.empty(sh, dtype=dt)
+    dl_device_np = np.empty(()).__dlpack_device__()
+
+    Y = dpt.from_dlpack(X, device=dl_device_np)
+    assert isinstance(Y, np.ndarray)
+    assert X.shape == Y.shape
+    assert X.dtype == Y.dtype
+
+    V = dpt.from_dlpack(Y)
+    assert isinstance(V, np.ndarray)
+    assert Y.shape == V.shape
+    assert Y.dtype == V.dtype
+
+
+@pytest.mark.parametrize("shape", [tuple(), (2,), (3, 0, 1), (2, 2, 2)])
+def test_from_dlpack_to_kdlcpu(shape, typestr):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(typestr, q.sycl_device)
+
+    X = dpt.empty(shape, dtype=typestr, sycl_queue=q)
+    Y = dpt.from_dlpack(X, device=(device_CPU, 0))
+    assert isinstance(Y, np.ndarray)
+    assert X.shape == Y.shape
+    assert X.dtype == Y.dtype
+    # NumPy does not treat size 0 arrays consistently
+    # w.r.t. strides, so skip these cases
+    if X.ndim and X.size != 0:
+        V = Y[::-1]
+        W = dpt.from_dlpack(V)
+        assert V.strides == W.strides
+
+
+@pytest.mark.parametrize("mod", [2, 5])
+def test_from_dlpack_to_kdlcpu_strides(mod, typestr):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(typestr, q.sycl_device)
+
+    X0 = dpt.empty(3 * mod, dtype=typestr, sycl_queue=q)
+    for start in range(mod):
+        X = X0[slice(-start - 1, None, -mod)]
+        Y = dpt.from_dlpack(X, device=(device_CPU, 0))
+        assert X.shape == Y.shape
+        assert X.dtype == Y.dtype
+        if Y.ndim:
+            V = Y[::-1]
+            W = dpt.from_dlpack(V)
+            assert V.strides == W.strides
+
+
+def test_dlpack_from_subdevice_to_kdlcpu():
+    """
+    Check that array allocated on a sub-device can be
+    imported via DLPack to kDLCPU device (as a NumPy array).
+    """
+    n = 64
+    try:
+        dev = dpctl.SyclDevice()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+    try:
+        sdevs = dev.create_sub_devices(partition="next_partitionable")
+    except dpctl.SyclSubDeviceCreationError:
+        sdevs = None
+    try:
+        if sdevs is None:
+            sdevs = dev.create_sub_devices(partition=[1, 1])
+    except dpctl.SyclSubDeviceCreationError:
+        pytest.skip("Default device can not be partitioned")
+    assert isinstance(sdevs, list) and len(sdevs) > 0
+    try:
+        ctx = sdevs[0].sycl_platform.default_context
+    except dpctl.SyclContextCreationError:
+        pytest.skip("Platform's default_context is not available")
+    try:
+        q = dpctl.SyclQueue(ctx, sdevs[0])
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+
+    ar = dpt.arange(n, dtype=dpt.int32, sycl_queue=q)
+    ar2 = dpt.from_dlpack(ar, dl_device=(device_CPU, 0))
+    assert isinstance(ar2, np.ndarray)
+
+
+def test_legacy_dlpack_capsule_from_numpy():
+    """
+    Check that NumPy's exported legacy DLPack capsule
+    will interoperate with from_dlpack_capsule,
+    especially with zero-copy.
+    """
+    x = np.arange(100, dtype="i4")
+    cap = x.__dlpack__()
+    y = _dlp.from_dlpack_capsule(cap)
+    del cap
+    assert x.ctypes.data == y.ctypes.data
+
+    x = np.arange(100, dtype="u4").reshape((10, 10)).T
+    cap = x.__dlpack__()
+    y = _dlp.from_dlpack_capsule(cap)
+    del cap
+    assert x.ctypes.data == y.ctypes.data
+    del x
+
+    x = np.arange(100, dtype="f4").reshape((10, 10), order="F")
+    cap = x.__dlpack__()
+    y = _dlp.from_dlpack_capsule(cap)
+    del cap
+    assert x.ctypes.data == y.ctypes.data
+
+    x = np.arange(100, dtype="c8")
+    x1 = x[::-2]
+    cap = x1.__dlpack__()
+    y = _dlp.from_dlpack_capsule(cap)
+    assert x1.ctypes.data == y.ctypes.data
+    del x1, y, x
+    del cap
+
+    x = np.ones(100, dtype="?")
+    x1 = x[::-2]
+    cap = x1.__dlpack__()
+    y = _dlp.from_dlpack_capsule(cap)
+    assert x1.ctypes.data == y.ctypes.data
+    del x1, y, x
+    del cap
+
+
+def test_dlpack_capsule_readonly_array_to_kdlcpu():
+    try:
+        x = dpt.arange(100, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    max_supported_ver = _dlp.get_build_dlpack_version()
+    # read-only array
+    x.flags["W"] = False
+    cap = x.__dlpack__(max_version=max_supported_ver, dl_device=(device_CPU, 0))
+    y = _dlp.from_dlpack_capsule(cap)
+    assert dpt.all(x == dpt.asarray(y))
+    assert not y.flags["W"]
+
+    cap1 = _dlp.numpy_to_dlpack_versioned_capsule(y, not y.flags["W"])
+    y1 = _dlp.from_dlpack_capsule(cap1)
+    assert not y1.flags["W"]
+
+
+def test_to_dlpack_capsule_c_and_f_contig():
+    try:
+        x = dpt.asarray(np.random.rand(2, 3))
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    cap = _dlp.to_dlpack_capsule(x)
+    y = _dlp.from_dlpack_capsule(cap)
+    assert np.allclose(dpt.asnumpy(x), dpt.asnumpy(y))
+    assert x.strides == y.strides
+
+    x_f = x.T
+    cap = _dlp.to_dlpack_capsule(x_f)
+    yf = _dlp.from_dlpack_capsule(cap)
+    assert np.allclose(dpt.asnumpy(x_f), dpt.asnumpy(yf))
+    assert x_f.strides == yf.strides
+    del cap
+
+
+def test_to_dlpack_versioned_capsule_c_and_f_contig():
+    try:
+        x = dpt.asarray(np.random.rand(2, 3))
+        max_supported_ver = _dlp.get_build_dlpack_version()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No default device available")
+
+    cap = x.__dlpack__(max_version=max_supported_ver)
+    y = _dlp.from_dlpack_capsule(cap)
+    assert np.allclose(dpt.asnumpy(x), dpt.asnumpy(y))
+    assert x.strides == y.strides
+
+    x_f = x.T
+    cap = x_f.__dlpack__(max_version=max_supported_ver)
+    yf = _dlp.from_dlpack_capsule(cap)
+    assert np.allclose(dpt.asnumpy(x_f), dpt.asnumpy(yf))
+    assert x_f.strides == yf.strides
+    del cap
+
+
+def test_used_dlpack_capsule_from_numpy():
+    get_queue_or_skip()
+
+    x_np = np.arange(100, dtype="i4")
+
+    cap = x_np.__dlpack__()
+    _dlp.from_dlpack_capsule(cap)
+    with pytest.raises(
+        ValueError,
+        match="A DLPack tensor object can not be consumed multiple times",
+    ):
+        _dlp.from_dlpack_capsule(cap)
+    del cap
+
+    x = dpt.asarray(x_np)
+    max_supported_ver = _dlp.get_build_dlpack_version()
+    cap = x.__dlpack__(max_version=max_supported_ver, dl_device=(device_CPU, 0))
+    _dlp.from_dlpack_capsule(cap)
+    with pytest.raises(
+        ValueError,
+        match="A DLPack tensor object can not be consumed multiple times",
+    ):
+        _dlp.from_dlpack_capsule(cap)
+    del cap
+
+
+def test_dlpack_size_0_on_kdlcpu():
+    get_queue_or_skip()
+    x_np = np.ones(0, dtype="i4")
+
+    cap = x_np.__dlpack__()
+    y = _dlp.from_dlpack_capsule(cap)
+    assert y.ctypes.data == x_np.ctypes.data
+
+
+def test_copy_via_host():
+    get_queue_or_skip()
+    x = dpt.ones(1, dtype="i4")
+    x_np = np.ones(1, dtype="i4")
+    x_dl_dev = x.__dlpack_device__()
+    y = dpt.from_dlpack(x_np, device=x_dl_dev)
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.sycl_device == x.sycl_device
+    assert y.usm_type == "device"
+
+    with pytest.raises(ValueError):
+        # incorrect length of tuple
+        dpt.from_dlpack(x_np, device=(1, 0, 0))
+    with pytest.raises(ValueError):
+        # only kDLCPU and kDLOneAPI are supported
+        dpt.from_dlpack(x, device=(2, 0))
+
+    num_devs = dpctl.get_num_devices()
+    if num_devs > 1:
+        j = [i for i in range(num_devs) if i != x_dl_dev[1]][0]
+        z = dpt.from_dlpack(x, device=(x_dl_dev[0], j))
+        assert isinstance(z, dpt.usm_ndarray)
+        assert z.usm_type == "device"
+
+
+def test_copy_via_host_gh_1789():
+    "Test based on review example from gh-1789"
+    get_queue_or_skip()
+    x_np = np.ones((10, 10), dtype="i4")
+    # strides are no longer multiple of itemsize
+    x_np.strides = (x_np.strides[0] - 1, x_np.strides[1])
+    with pytest.raises(BufferError):
+        dpt.from_dlpack(x_np)
+    with pytest.raises(BufferError):
+        dpt.from_dlpack(x_np, device=(14, 0))
+
+
+class LegacyContainer:
+    "Helper class implementing legacy `__dlpack__` protocol"
+
+    def __init__(self, array):
+        self._array = array
+
+    def __dlpack__(self, stream=None):
+        return self._array.__dlpack__(stream=stream)
+
+    def __dlpack_device__(self):
+        return self._array.__dlpack_device__()
+
+
+class Container:
+    "Helper class implementing `__dlpack__` protocol version 1.0"
+
+    def __init__(self, array):
+        self._array = array
+
+    def __dlpack__(
+        self, max_version=None, dl_device=None, copy=None, stream=None
+    ):
+        return self._array.__dlpack__(
+            max_version=max_version,
+            dl_device=dl_device,
+            copy=copy,
+            stream=stream,
+        )
+
+    def __dlpack_device__(self):
+        return self._array.__dlpack_device__()
+
+
+def test_generic_container_legacy():
+    get_queue_or_skip()
+    C = LegacyContainer(dpt.linspace(0, 100, num=20, dtype="int16"))
+
+    X = dpt.from_dlpack(C)
+    assert isinstance(X, dpt.usm_ndarray)
+    assert X._pointer == C._array._pointer
+    assert X.sycl_device == C._array.sycl_device
+    assert X.dtype == C._array.dtype
+
+    Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0))
+    assert isinstance(Y, np.ndarray)
+    assert Y.dtype == X.dtype
+
+    Z = dpt.from_dlpack(C, device=X.device)
+    assert isinstance(Z, dpt.usm_ndarray)
+    assert Z._pointer == X._pointer
+    assert Z.device == X.device
+
+
+def test_generic_container_legacy_np():
+    get_queue_or_skip()
+    C = LegacyContainer(np.linspace(0, 100, num=20, dtype="int16"))
+
+    X = dpt.from_dlpack(C)
+    assert isinstance(X, np.ndarray)
+    assert X.ctypes.data == C._array.ctypes.data
+    assert X.dtype == C._array.dtype
+
+    Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0))
+    assert isinstance(Y, np.ndarray)
+    assert Y.dtype == X.dtype
+
+    dev = dpt.Device.create_device()
+    Z = dpt.from_dlpack(C, device=dev)
+    assert isinstance(Z, dpt.usm_ndarray)
+    assert Z.device == dev
+
+
+def test_generic_container():
+    get_queue_or_skip()
+    C = Container(dpt.linspace(0, 100, num=20, dtype="int16"))
+
+    X = dpt.from_dlpack(C)
+    assert isinstance(X, dpt.usm_ndarray)
+    assert X._pointer == C._array._pointer
+    assert X.sycl_device == C._array.sycl_device
+    assert X.dtype == C._array.dtype
+
+    Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0))
+    assert isinstance(Y, np.ndarray)
+    assert Y.dtype == X.dtype
+
+    Z = dpt.from_dlpack(C, device=X.device)
+    assert isinstance(Z, dpt.usm_ndarray)
+    assert Z._pointer == X._pointer
+    assert Z.device == X.device
+
+
+def test_sycl_device_to_dldevice(all_root_devices):
+    for sycl_dev in all_root_devices:
+        dev = dpt.sycl_device_to_dldevice(sycl_dev)
+        assert type(dev) is tuple
+        assert len(dev) == 2
+        assert dev[0] == device_oneAPI
+        assert dev[1] == sycl_dev.get_device_id()
+
+
+def test_dldevice_to_sycl_device(all_root_devices):
+    for sycl_dev in all_root_devices:
+        dldev = dpt.empty(0, device=sycl_dev).__dlpack_device__()
+        dev = dpt.dldevice_to_sycl_device(dldev)
+        assert type(dev) is dpctl.SyclDevice
+        assert dev.get_device_id() == sycl_dev.get_device_id()
+
+
+def test_dldevice_conversion_arg_validation():
+    bad_dldevice_type = (dpt.DLDeviceType.kDLCPU, 0)
+    with pytest.raises(ValueError):
+        dpt.dldevice_to_sycl_device(bad_dldevice_type)
+
+    bad_dldevice_len = bad_dldevice_type + (0,)
+    with pytest.raises(ValueError):
+        dpt.dldevice_to_sycl_device(bad_dldevice_len)
+
+    bad_dldevice = {}
+    with pytest.raises(TypeError):
+        dpt.dldevice_to_sycl_device(bad_dldevice)
+
+    bad_sycldevice = {}
+    with pytest.raises(TypeError):
+        dpt.sycl_device_to_dldevice(bad_sycldevice)
diff --git a/dpnp/tests/tensor/test_usm_ndarray_indexing.py b/dpnp/tests/tensor/test_usm_ndarray_indexing.py
new file mode 100644
index 000000000000..530d4ab2988c
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_indexing.py
@@ -0,0 +1,2055 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import numpy as np
+import pytest
+from dpctl.utils import ExecutionPlacementError
+from numpy.testing import assert_array_equal
+
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
+from dpnp.tensor._copy_utils import _take_multi_index
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_all_dtypes = [
+    "u1",
+    "i1",
+    "u2",
+    "i2",
+    "u4",
+    "i4",
+    "u8",
+    "i8",
+    "e",
+    "f",
+    "d",
+    "F",
+    "D",
+]
+
+_all_int_dtypes = ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"]
+
+
+def test_basic_slice1():
+    q = get_queue_or_skip()
+    x = dpt.empty(10, dtype="u2", sycl_queue=q)
+    y = x[0]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.ndim == 0
+    assert y.shape == ()
+    assert y.strides == ()
+
+
+def test_basic_slice2():
+    q = get_queue_or_skip()
+    x = dpt.empty(10, dtype="i2", sycl_queue=q)
+    y = x[(0,)]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.ndim == 0
+    assert y.shape == ()
+    assert y.strides == ()
+
+
+def test_basic_slice3():
+    q = get_queue_or_skip()
+    x = dpt.empty(10, dtype="i2", sycl_queue=q)
+    y = x[:]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.ndim == x.ndim
+    assert y.shape == x.shape
+    assert y.strides == x.strides
+    y = x[(slice(None, None, None),)]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.ndim == x.ndim
+    assert y.shape == x.shape
+    assert y.strides == x.strides
+
+
+def test_basic_slice4():
+    q = get_queue_or_skip()
+    n0, n1 = 5, 3
+    x = dpt.empty((n0, n1), dtype="f4", sycl_queue=q)
+    y = x[::-1]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == x.shape
+    assert y.strides == (-x.strides[0], x.strides[1])
+    actual_offset = y.__sycl_usm_array_interface__["offset"]
+    assert actual_offset == (n0 - 1) * n1
+
+
+def test_basic_slice5():
+    q = get_queue_or_skip()
+    n0, n1 = 5, 3
+    x = dpt.empty((n0, n1), dtype="c8", sycl_queue=q)
+    y = x[:, ::-1]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == x.shape
+    assert y.strides == (x.strides[0], -x.strides[1])
+    actual_offset = y.__sycl_usm_array_interface__["offset"]
+    assert actual_offset == (n1 - 1)
+
+
+def test_basic_slice6():
+    q = get_queue_or_skip()
+    i0, n0, n1 = 2, 4, 3
+    x = dpt.empty((n0, n1), dtype="c8", sycl_queue=q)
+    y = x[i0, ::-1]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == (x.shape[1],)
+    assert y.strides == (-x.strides[1],)
+    actual_offset = y.__sycl_usm_array_interface__["offset"]
+    expected_offset = i0 * x.strides[0] + (n1 - 1) * x.strides[1]
+    assert actual_offset == expected_offset
+
+
+def test_basic_slice7():
+    q = get_queue_or_skip()
+    n0, n1, n2 = 5, 3, 2
+    x = dpt.empty((n0, n1, n2), dtype="?", sycl_queue=q)
+    y = x[..., ::-1]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == x.shape
+    assert y.strides == (
+        x.strides[0],
+        x.strides[1],
+        -x.strides[2],
+    )
+    actual_offset = y.__sycl_usm_array_interface__["offset"]
+    expected_offset = (n2 - 1) * x.strides[2]
+    assert actual_offset == expected_offset
+
+
+def test_basic_slice8():
+    q = get_queue_or_skip()
+    n0, n1 = 3, 7
+    x = dpt.empty((n0, n1), dtype="u1", sycl_queue=q)
+    y = x[..., dpt.newaxis]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == (n0, n1, 1)
+    assert y.strides == (n1, 1, 0)
+
+
+def test_basic_slice9():
+    q = get_queue_or_skip()
+    n0, n1 = 3, 7
+    x = dpt.empty((n0, n1), dtype="u8", sycl_queue=q)
+    y = x[dpt.newaxis, ...]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == (1, n0, n1)
+    assert y.strides == (0, n1, 1)
+
+
+def test_basic_slice10():
+    q = get_queue_or_skip()
+    n0, n1, n2 = 3, 7, 5
+    x = dpt.empty((n0, n1, n2), dtype="u1", sycl_queue=q)
+    y = x[dpt.newaxis, ..., :]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == (1, n0, n1, n2)
+    assert y.strides == (0, n1 * n2, n2, 1)
+
+
+def _all_equal(it1, it2):
+    return all(bool(x == y) for x, y in zip(it1, it2))
+
+
+def test_advanced_slice1():
+    q = get_queue_or_skip()
+    ii = dpt.asarray([1, 2], sycl_queue=q)
+    x = dpt.arange(10, dtype="i4", sycl_queue=q)
+    y = x[ii]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == ii.shape
+    assert y.strides == (1,)
+    assert _all_equal(
+        (x[ii[k]] for k in range(ii.shape[0])),
+        (y[k] for k in range(ii.shape[0])),
+    )
+    y = x[(ii,)]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == ii.shape
+    assert y.strides == (1,)
+    assert _all_equal(
+        (x[ii[k]] for k in range(ii.shape[0])),
+        (y[k] for k in range(ii.shape[0])),
+    )
+
+
+def test_advanced_slice1_negative_strides():
+    q = get_queue_or_skip()
+    ii = dpt.asarray([0, 1], sycl_queue=q)
+    x = dpt.flip(dpt.arange(5, dtype="i4", sycl_queue=q))
+    y = x[ii]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == ii.shape
+    assert y.strides == (1,)
+    assert _all_equal(
+        (x[ii[k]] for k in range(ii.shape[0])),
+        (y[k] for k in range(ii.shape[0])),
+    )
+
+
+def test_advanced_slice2():
+    q = get_queue_or_skip()
+    ii = dpt.asarray([1, 2], sycl_queue=q)
+    x = dpt.arange(10, dtype="i4", sycl_queue=q)
+    y = x[ii, dpt.newaxis]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == ii.shape + (1,)
+    assert y.flags["C"]
+
+
+def test_advanced_slice3():
+    q = get_queue_or_skip()
+    ii = dpt.asarray([1, 2], sycl_queue=q)
+    x = dpt.arange(10, dtype="i4", sycl_queue=q)
+    y = x[dpt.newaxis, ii]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == (1,) + ii.shape
+    assert y.flags["C"]
+
+
+def _make_3d(dt, q):
+    return dpt.reshape(
+        dpt.arange(3 * 3 * 3, dtype=dt, sycl_queue=q),
+        (
+            3,
+            3,
+            3,
+        ),
+    )
+
+
+def test_advanced_slice4():
+    q = get_queue_or_skip()
+    ii = dpt.asarray([1, 2], sycl_queue=q)
+    x = _make_3d("i4", q)
+    y = x[ii, ii, ii]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == ii.shape
+    assert _all_equal(
+        (x[ii[k], ii[k], ii[k]] for k in range(ii.shape[0])),
+        (y[k] for k in range(ii.shape[0])),
+    )
+
+
+def test_advanced_slice5():
+    q = get_queue_or_skip()
+    ii = dpt.asarray([1, 2], sycl_queue=q)
+    x = _make_3d("i4", q)
+    y = x[ii, 0, ii]
+    assert isinstance(y, dpt.usm_ndarray)
+    # 0 broadcast to [0, 0] per array API
+    assert y.shape == ii.shape
+    assert _all_equal(
+        (x[ii[i], 0, ii[i]] for i in range(ii.shape[0])),
+        (y[i] for i in range(ii.shape[0])),
+    )
+
+
+def test_advanced_slice6():
+    q = get_queue_or_skip()
+    ii = dpt.asarray([1, 2], sycl_queue=q)
+    x = _make_3d("i4", q)
+    y = x[:, ii, ii]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == (
+        x.shape[0],
+        ii.shape[0],
+    )
+    assert _all_equal(
+        (
+            x[i, ii[k], ii[k]]
+            for i in range(x.shape[0])
+            for k in range(ii.shape[0])
+        ),
+        (y[i, k] for i in range(x.shape[0]) for k in range(ii.shape[0])),
+    )
+
+
+def test_advanced_slice7():
+    q = get_queue_or_skip()
+    mask = dpt.asarray(
+        [
+            [[True, True, False], [False, True, True], [True, False, True]],
+            [[True, False, False], [False, False, True], [False, True, False]],
+            [[True, True, True], [False, False, False], [False, False, True]],
+        ],
+        sycl_queue=q,
+    )
+    x = _make_3d("i2", q)
+    y = x[mask]
+    expected = [0, 1, 4, 5, 6, 8, 9, 14, 16, 18, 19, 20, 26]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == (len(expected),)
+    assert all(dpt.asnumpy(y[k]) == expected[k] for k in range(len(expected)))
+
+
+def test_advanced_slice8():
+    q = get_queue_or_skip()
+    mask = dpt.asarray(
+        [[True, False, False], [False, True, False], [False, True, False]],
+        sycl_queue=q,
+    )
+    x = _make_3d("u2", q)
+    y = x[mask]
+    expected = dpt.asarray(
+        [[0, 1, 2], [12, 13, 14], [21, 22, 23]], sycl_queue=q
+    )
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == expected.shape
+    assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all()
+
+
+def test_advanced_slice9():
+    q = get_queue_or_skip()
+    mask = dpt.asarray(
+        [[True, False, False], [False, True, False], [False, True, False]],
+        sycl_queue=q,
+    )
+    x = _make_3d("u4", q)
+    y = x[:, mask]
+    expected = dpt.asarray([[0, 4, 7], [9, 13, 16], [18, 22, 25]], sycl_queue=q)
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == expected.shape
+    assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all()
+
+
+def lin_id(i, j, k):
+    """global_linear_id for (3,3,3) range traversed in C-contiguous order"""
+    return 9 * i + 3 * j + k
+
+
+def test_advanced_slice10():
+    q = get_queue_or_skip()
+    x = _make_3d("u8", q)
+    i0 = dpt.asarray([0, 1, 1], device=x.device)
+    i1 = dpt.asarray([1, 1, 2], device=x.device)
+    i2 = dpt.asarray([2, 0, 1], device=x.device)
+    y = x[i0, i1, i2]
+    res_expected = dpt.asarray(
+        [
+            lin_id(0, 1, 2),
+            lin_id(1, 1, 0),
+            lin_id(1, 2, 1),
+        ],
+        sycl_queue=q,
+    )
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == res_expected.shape
+    assert (dpt.asnumpy(y) == dpt.asnumpy(res_expected)).all()
+
+
+def test_advanced_slice11():
+    q = get_queue_or_skip()
+    x = _make_3d("u8", q)
+    i0 = dpt.asarray([0, 1, 1], device=x.device)
+    i2 = dpt.asarray([2, 0, 1], device=x.device)
+    with pytest.raises(IndexError):
+        x[i0, :, i2]
+
+
+def test_advanced_slice12():
+    q = get_queue_or_skip()
+    x = _make_3d("u8", q)
+    i1 = dpt.asarray([1, 1, 2], device=x.device)
+    i2 = dpt.asarray([2, 0, 1], device=x.device)
+    y = x[:, dpt.newaxis, i1, i2, dpt.newaxis]
+    res_expected = dpt.asarray(
+        [
+            [[[lin_id(0, 1, 2)], [lin_id(0, 1, 0)], [lin_id(0, 2, 1)]]],
+            [[[lin_id(1, 1, 2)], [lin_id(1, 1, 0)], [lin_id(1, 2, 1)]]],
+            [[[lin_id(2, 1, 2)], [lin_id(2, 1, 0)], [lin_id(2, 2, 1)]]],
+        ],
+        sycl_queue=q,
+    )
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == res_expected.shape
+    assert (dpt.asnumpy(y) == dpt.asnumpy(res_expected)).all()
+
+
+def test_advanced_slice13():
+    q = get_queue_or_skip()
+    x = _make_3d("u8", q)
+    i1 = dpt.asarray([[1], [2]], device=x.device)
+    i2 = dpt.asarray([[0, 1]], device=x.device)
+    y = x[i1, i2, 0]
+    expected = dpt.asarray(
+        [
+            [lin_id(1, 0, 0), lin_id(1, 1, 0)],
+            [lin_id(2, 0, 0), lin_id(2, 1, 0)],
+        ],
+        device=x.device,
+    )
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == expected.shape
+    assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all()
+
+
+def test_advanced_slice14():
+    q = get_queue_or_skip()
+    ii = dpt.asarray([1, 2], sycl_queue=q)
+    x = dpt.reshape(dpt.arange(3**5, dtype="i4", sycl_queue=q), (3,) * 5)
+    y = x[ii, 0, ii, 1, :]
+    assert isinstance(y, dpt.usm_ndarray)
+    # integers broadcast to ii.shape per array API
+    assert y.shape == ii.shape + x.shape[-1:]
+    assert _all_equal(
+        (
+            x[ii[i], 0, ii[i], 1, k]
+            for i in range(ii.shape[0])
+            for k in range(x.shape[-1])
+        ),
+        (y[i, k] for i in range(ii.shape[0]) for k in range(x.shape[-1])),
+    )
+
+
+def test_advanced_slice15():
+    q = get_queue_or_skip()
+    ii = dpt.asarray([1, 2], sycl_queue=q)
+    x = dpt.reshape(dpt.arange(3**5, dtype="i4", sycl_queue=q), (3,) * 5)
+    # : cannot appear between two integral arrays
+    with pytest.raises(IndexError):
+        x[ii, 0, ii, :, ii]
+
+
+def test_advanced_slice16():
+    q = get_queue_or_skip()
+    ii = dpt.asarray(1, sycl_queue=q)
+    i0 = dpt.asarray(False, sycl_queue=q)
+    i1 = dpt.asarray(True, sycl_queue=q)
+    x = dpt.reshape(dpt.arange(3**5, dtype="i4", sycl_queue=q), (3,) * 5)
+    y = x[ii, i0, ii, i1, :]
+    # TODO: add a shape check here when discrepancy with NumPy is investigated
+    assert isinstance(y, dpt.usm_ndarray)
+
+
+def test_integer_indexing_numpy_array():
+    q = get_queue_or_skip()
+    ii = np.asarray([1, 2])
+    x = dpt.arange(10, dtype="i4", sycl_queue=q)
+    y = x[ii]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == ii.shape
+    assert dpt.all(x[1:3] == y)
+
+
+def test_boolean_indexing_numpy_array():
+    q = get_queue_or_skip()
+    ii = np.asarray(
+        [False, True, True, False, False, False, False, False, False, False]
+    )
+    x = dpt.arange(10, dtype="i4", sycl_queue=q)
+    y = x[ii]
+    assert isinstance(y, dpt.usm_ndarray)
+    assert y.shape == (2,)
+    assert dpt.all(x[1:3] == y)
+
+
+def test_boolean_indexing_validation():
+    get_queue_or_skip()
+    x = dpt.zeros(10, dtype="i4")
+    ii = dpt.ones((2, 5), dtype="?")
+    with pytest.raises(IndexError):
+        x[ii]
+    with pytest.raises(IndexError):
+        x[ii[0, :]]
+
+
+def test_boolean_indexing_getitem_empty_mask():
+    get_queue_or_skip()
+    x = dpt.ones((2, 3, 4), dtype="i4")
+    ii = dpt.ones((0,), dtype="?")
+    assert x[ii].size == 0
+    ii1 = dpt.ones((0, 3), dtype="?")
+    assert x[ii1].size == 0
+    ii2 = dpt.ones((0, 3, 4), dtype="?")
+    assert x[ii2].size == 0
+
+
+def test_boolean_indexing_setitem_empty_mask():
+    get_queue_or_skip()
+    x = dpt.ones((2, 3, 4), dtype="i4")
+    ii = dpt.ones((0,), dtype="?")
+    x[ii] = 0
+    assert dpt.all(x == 1)
+    ii1 = dpt.ones((0, 3), dtype="?")
+    x[ii1] = 0
+    assert dpt.all(x == 1)
+    ii2 = dpt.ones((0, 3, 4), dtype="?")
+    x[ii2] = 0
+    assert dpt.all(x == 1)
+
+
+def test_integer_indexing_1d():
+    get_queue_or_skip()
+    x = dpt.arange(10, dtype="i4")
+    ind_1d = dpt.asarray([7, 3, 1], dtype="u2")
+    ind_2d = dpt.asarray([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4")
+
+    y1 = x[ind_1d]
+    assert y1.shape == ind_1d.shape
+    y2 = x[ind_2d]
+    assert y2.shape == ind_2d.shape
+    assert (dpt.asnumpy(y1) == np.array([7, 3, 1], dtype="i4")).all()
+    assert (
+        dpt.asnumpy(y2)
+        == np.array([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4")
+    ).all()
+
+
+def test_integer_indexing_2d():
+    get_queue_or_skip()
+    n0, n1 = 5, 7
+    x = dpt.reshape(
+        dpt.arange(n0 * n1, dtype="i4"),
+        (
+            n0,
+            n1,
+        ),
+    )
+    ind0 = dpt.arange(n0)
+    ind1 = dpt.arange(n1)
+
+    y = x[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]]
+    assert y.dtype == x.dtype
+    assert (dpt.asnumpy(y) == np.array([[5, 6], [12, 13]])).all()
+
+
+def test_integer_strided_indexing():
+    get_queue_or_skip()
+    n0, n1 = 5, 7
+    x = dpt.reshape(
+        dpt.arange(2 * n0 * n1, dtype="i4"),
+        (
+            2 * n0,
+            n1,
+        ),
+    )
+    ind0 = dpt.arange(n0)
+    ind1 = dpt.arange(n1)
+
+    z = x[::-2, :]
+    y = z[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]]
+    assert y.dtype == x.dtype
+    zc = dpt.copy(z, order="C")
+    yc = zc[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]]
+    assert (dpt.asnumpy(y) == dpt.asnumpy(yc)).all()
+
+
+def test_TrueFalse_indexing():
+    get_queue_or_skip()
+    n0, n1 = 2, 3
+    x = dpt.ones((n0, n1))
+    for ind in [True, dpt.asarray(True)]:
+        y1 = x[ind]
+        assert y1.shape == (1, n0, n1)
+        assert y1._pointer == x._pointer
+        y2 = x[:, ind]
+        assert y2.shape == (n0, 1, n1)
+        assert y2._pointer == x._pointer
+        y3 = x[..., ind]
+        assert y3.shape == (n0, n1, 1)
+        assert y3._pointer == x._pointer
+    for ind in [False, dpt.asarray(False)]:
+        y1 = x[ind]
+        assert y1.shape == (0, n0, n1)
+        assert y1._pointer == x._pointer
+        y2 = x[:, ind]
+        assert y2.shape == (n0, 0, n1)
+        assert y2._pointer == x._pointer
+        y3 = x[..., ind]
+        assert y3.shape == (n0, n1, 0)
+        assert y3._pointer == x._pointer
+
+
+def test_mixed_index_getitem():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(1000, dtype="i4"), (10, 10, 10))
+    i1b = dpt.ones(10, dtype="?")
+    info = x.__array_namespace__().__array_namespace_info__()
+    ind_dt = info.default_dtypes(device=x.device)["indexing"]
+    i0 = dpt.asarray([0, 2, 3], dtype=ind_dt)[:, dpt.newaxis]
+    i2 = dpt.asarray([3, 4, 7], dtype=ind_dt)[:, dpt.newaxis]
+    y = x[i0, i1b, i2]
+    assert y.shape == (3, dpt.sum(i1b, dtype="i8"))
+
+
+def test_mixed_index_setitem():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(1000, dtype="i4"), (10, 10, 10))
+    i1b = dpt.ones(10, dtype="?")
+    info = x.__array_namespace__().__array_namespace_info__()
+    ind_dt = info.default_dtypes(device=x.device)["indexing"]
+    i0 = dpt.asarray([0, 2, 3], dtype=ind_dt)[:, dpt.newaxis]
+    i2 = dpt.asarray([3, 4, 7], dtype=ind_dt)[:, dpt.newaxis]
+    v_shape = (3, int(dpt.sum(i1b, dtype="i8")))
+    canary = 7
+    x[i0, i1b, i2] = dpt.full(v_shape, canary, dtype=x.dtype)
+    assert x[0, 0, 3] == canary
+
+
+@pytest.mark.parametrize(
+    "data_dt",
+    _all_dtypes,
+)
+@pytest.mark.parametrize(
+    "ind_dt",
+    _all_int_dtypes,
+)
+def test_take_basic(data_dt, ind_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(data_dt, q)
+
+    x = dpt.arange(10, dtype=data_dt)
+    ind = dpt.arange(2, 5, dtype=ind_dt)
+    y = dpt.take(x, ind)
+    assert y.dtype == x.dtype
+    assert (dpt.asnumpy(y) == np.arange(2, 5, dtype=data_dt)).all()
+
+
+@pytest.mark.parametrize(
+    "data_dt",
+    _all_dtypes,
+)
+@pytest.mark.parametrize(
+    "ind_dt",
+    _all_int_dtypes,
+)
+def test_put_basic(data_dt, ind_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(data_dt, q)
+
+    x = dpt.arange(10, dtype=data_dt)
+    ind = dpt.arange(2, 5, dtype=ind_dt)
+    val = dpt.ones(3, dtype=data_dt)
+    dpt.put(x, ind, val)
+    assert (
+        dpt.asnumpy(x)
+        == np.array([0, 1, 1, 1, 1, 5, 6, 7, 8, 9], dtype=data_dt)
+    ).all()
+
+
+def test_take_basic_axis():
+    get_queue_or_skip()
+
+    n0, n1 = 5, 7
+    x = dpt.reshape(
+        dpt.arange(n0 * n1, dtype="i4"),
+        (
+            n0,
+            n1,
+        ),
+    )
+    ind = dpt.arange(2, 4)
+    y0 = dpt.take(x, ind, axis=0)
+    y1 = dpt.take(x, ind, axis=1)
+    assert y0.shape == (2, n1)
+    assert y1.shape == (n0, 2)
+
+
+def test_put_basic_axis():
+    get_queue_or_skip()
+
+    n0, n1 = 5, 7
+    x = dpt.reshape(
+        dpt.arange(n0 * n1, dtype="i4"),
+        (
+            n0,
+            n1,
+        ),
+    )
+    ind = dpt.arange(2, 4)
+    v0 = dpt.zeros((2, n1), dtype=x.dtype)
+    v1 = dpt.zeros((n0, 2), dtype=x.dtype)
+    dpt.put(x, ind, v0, axis=0)
+    dpt.put(x, ind, v1, axis=1)
+    expected = np.arange(n0 * n1, dtype="i4").reshape((n0, n1))
+    expected[[2, 3], :] = 0
+    expected[:, [2, 3]] = 0
+    assert (expected == dpt.asnumpy(x)).all()
+
+
+@pytest.mark.parametrize("data_dt", _all_dtypes)
+def test_put_0d_val(data_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(data_dt, q)
+
+    x = dpt.arange(5, dtype=data_dt, sycl_queue=q)
+    ind = dpt.asarray([0], dtype="i8", sycl_queue=q)
+    val = dpt.asarray(2, dtype=x.dtype, sycl_queue=q)
+    x[ind] = val
+    assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x[0]))
+
+    x = dpt.asarray(5, dtype=data_dt, sycl_queue=q)
+    dpt.put(x, ind, val)
+    assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x))
+
+
+@pytest.mark.parametrize(
+    "data_dt",
+    _all_dtypes,
+)
+def test_take_0d_data(data_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(data_dt, q)
+
+    x = dpt.asarray(0, dtype=data_dt, sycl_queue=q)
+    ind = dpt.arange(5, dtype="i8", sycl_queue=q)
+
+    y = dpt.take(x, ind)
+    assert (
+        dpt.asnumpy(y)
+        == np.broadcast_to(np.asarray(0, dtype=data_dt), ind.shape)
+    ).all()
+
+
+@pytest.mark.parametrize(
+    "data_dt",
+    _all_dtypes,
+)
+def test_put_0d_data(data_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(data_dt, q)
+
+    x = dpt.asarray(0, dtype=data_dt, sycl_queue=q)
+    ind = dpt.arange(5, dtype="i8", sycl_queue=q)
+    val = dpt.asarray(2, dtype=data_dt, sycl_queue=q)
+
+    dpt.put(x, ind, val, axis=0)
+    assert (
+        dpt.asnumpy(x)
+        == np.broadcast_to(np.asarray(2, dtype=data_dt), ind.shape)
+    ).all()
+
+
+@pytest.mark.parametrize(
+    "ind_dt",
+    _all_int_dtypes,
+)
+def test_indexing_0d_ind(ind_dt):
+    q = get_queue_or_skip()
+
+    x = dpt.arange(5, dtype="i4", sycl_queue=q)
+    ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q)
+
+    y = x[ind]
+    assert dpt.asnumpy(x[3]) == dpt.asnumpy(y)
+
+
+@pytest.mark.parametrize(
+    "ind_dt",
+    _all_int_dtypes,
+)
+def test_put_0d_ind(ind_dt):
+    q = get_queue_or_skip()
+
+    x = dpt.arange(5, dtype="i4", sycl_queue=q)
+    ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q)
+    val = dpt.asarray(5, dtype=x.dtype, sycl_queue=q)
+
+    x[ind] = val
+    assert dpt.asnumpy(x[3]) == dpt.asnumpy(val)
+
+
+@pytest.mark.parametrize(
+    "data_dt",
+    _all_dtypes,
+)
+def test_take_strided_1d_source(data_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(data_dt, q)
+
+    x = dpt.arange(27, dtype=data_dt, sycl_queue=q)
+    ind = dpt.arange(4, 9, dtype="i8", sycl_queue=q)
+
+    x_np = dpt.asnumpy(x)
+    ind_np = dpt.asnumpy(ind)
+
+    for s in (
+        slice(None, None, 2),
+        slice(None, None, -2),
+    ):
+        assert_array_equal(
+            np.take(x_np[s], ind_np, axis=0),
+            dpt.asnumpy(dpt.take(x[s], ind, axis=0)),
+        )
+
+    # 0-strided
+    x = dpt.usm_ndarray(
+        (27,),
+        dtype=data_dt,
+        strides=(0,),
+        buffer_ctor_kwargs={"queue": q},
+    )
+    x[0] = x_np[0]
+    assert_array_equal(
+        np.broadcast_to(x_np[0], ind.shape),
+        dpt.asnumpy(dpt.take(x, ind, axis=0)),
+    )
+
+
+@pytest.mark.parametrize(
+    "data_dt",
+    _all_dtypes,
+)
+@pytest.mark.parametrize("order", ["C", "F"])
+def test_take_strided(data_dt, order):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(data_dt, q)
+
+    x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order)
+    ind = dpt.arange(2, dtype="i8", sycl_queue=q)
+
+    x_np = dpt.asnumpy(x)
+    ind_np = dpt.asnumpy(ind)
+
+    for s in (
+        slice(None, None, 2),
+        slice(None, None, -2),
+    ):
+        for sgn in (-1, 1):
+            xs = x[s, ::sgn]
+            xs_np = x_np[s, ::sgn]
+            assert_array_equal(
+                np.take(xs_np, ind_np, axis=0),
+                dpt.asnumpy(dpt.take(xs, ind, axis=0)),
+            )
+            assert_array_equal(
+                np.take(xs_np, ind_np, axis=1),
+                dpt.asnumpy(dpt.take(xs, ind, axis=1)),
+            )
+
+
+@pytest.mark.parametrize(
+    "ind_dt",
+    _all_int_dtypes,
+)
+def test_take_strided_1d_indices(ind_dt):
+    q = get_queue_or_skip()
+
+    x = dpt.arange(27, dtype="i4", sycl_queue=q)
+    ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q)
+
+    x_np = dpt.asnumpy(x)
+    ind_np = dpt.asnumpy(ind).astype("i8")
+
+    for s in (
+        slice(None, None, 2),
+        slice(None, None, -2),
+    ):
+        assert_array_equal(
+            np.take(x_np, ind_np[s], axis=0),
+            dpt.asnumpy(dpt.take(x, ind[s], axis=0)),
+        )
+
+    # 0-strided
+    ind = dpt.usm_ndarray(
+        (12,),
+        dtype=ind_dt,
+        strides=(0,),
+        buffer_ctor_kwargs={"queue": q},
+    )
+    ind[0] = ind_np[0]
+    assert_array_equal(
+        np.broadcast_to(x_np[ind_np[0]], ind.shape),
+        dpt.asnumpy(dpt.take(x, ind, axis=0)),
+    )
+
+
+@pytest.mark.parametrize(
+    "ind_dt",
+    _all_int_dtypes,
+)
+@pytest.mark.parametrize("order", ["C", "F"])
+def test_take_strided_indices(ind_dt, order):
+    q = get_queue_or_skip()
+
+    x = dpt.arange(27, dtype="i4", sycl_queue=q)
+    ind = dpt.reshape(
+        dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q), (4, 3), order=order
+    )
+
+    x_np = dpt.asnumpy(x)
+    ind_np = dpt.asnumpy(ind).astype("i8")
+
+    for s in (
+        slice(None, None, 2),
+        slice(None, None, -2),
+    ):
+        for sgn in [-1, 1]:
+            inds = ind[s, ::sgn]
+            inds_np = ind_np[s, ::sgn]
+            assert_array_equal(
+                np.take(x_np, inds_np, axis=0),
+                dpt.asnumpy(x[inds]),
+            )
+
+
+@pytest.mark.parametrize(
+    "data_dt",
+    _all_dtypes,
+)
+@pytest.mark.parametrize("order", ["C", "F"])
+def test_put_strided_1d_destination(data_dt, order):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(data_dt, q)
+
+    x = dpt.arange(27, dtype=data_dt, sycl_queue=q)
+    ind = dpt.arange(4, 9, dtype="i8", sycl_queue=q)
+    val = dpt.asarray(9, dtype=x.dtype, sycl_queue=q)
+
+    x_np = dpt.asnumpy(x)
+    ind_np = dpt.asnumpy(ind)
+    val_np = dpt.asnumpy(val)
+
+    for s in (
+        slice(None, None, 2),
+        slice(None, None, -2),
+    ):
+        x_np1 = x_np.copy()
+        x_np1[s][ind_np] = val_np
+
+        x1 = dpt.copy(x)
+        dpt.put(x1[s], ind, val, axis=0)
+
+        assert_array_equal(x_np1, dpt.asnumpy(x1))
+
+
+@pytest.mark.parametrize(
+    "data_dt",
+    _all_dtypes,
+)
+@pytest.mark.parametrize("order", ["C", "F"])
+def test_put_strided_destination(data_dt, order):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(data_dt, q)
+
+    x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order)
+    ind = dpt.arange(2, dtype="i8", sycl_queue=q)
+    val = dpt.asarray(9, dtype=x.dtype, sycl_queue=q)
+
+    x_np = dpt.asnumpy(x)
+    ind_np = dpt.asnumpy(ind)
+    val_np = dpt.asnumpy(val)
+
+    for s in (
+        slice(None, None, 2),
+        slice(None, None, -2),
+    ):
+        for sgn in [-1, 1]:
+            xs = x[s, ::sgn]
+            xs_np = x_np[s, ::sgn]
+
+            x_np1 = xs_np.copy()
+            x_np1[ind_np] = val_np
+
+            x1 = dpt.copy(xs)
+            dpt.put(x1, ind, val, axis=0)
+            assert_array_equal(x_np1, dpt.asnumpy(x1))
+
+            x_np1 = xs_np.copy()
+            x_np1[:, ind_np] = val_np
+
+            x1 = dpt.copy(xs)
+            dpt.put(x1, ind, val, axis=1)
+            assert_array_equal(x_np1, dpt.asnumpy(x1))
+
+            x_np1 = xs_np.copy()
+            x_np1[ind_np, ind_np] = val_np
+
+            x1 = dpt.copy(xs)
+            x1[ind, ind] = val
+            assert_array_equal(x_np1, dpt.asnumpy(x1))
+
+
+@pytest.mark.parametrize(
+    "ind_dt",
+    _all_int_dtypes,
+)
+def test_put_strided_1d_indices(ind_dt):
+    q = get_queue_or_skip()
+
+    x = dpt.arange(27, dtype="i4", sycl_queue=q)
+    ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q)
+    val = dpt.asarray(-1, dtype=x.dtype, sycl_queue=q)
+
+    x_np = dpt.asnumpy(x)
+    ind_np = dpt.asnumpy(ind).astype("i8")
+    val_np = dpt.asnumpy(val)
+
+    for s in (
+        slice(None, None, 2),
+        slice(None, None, -2),
+    ):
+        x_copy = dpt.copy(x)
+        dpt.put(x_copy, ind[s], val, axis=0)
+
+        x_np_copy = x_np.copy()
+        x_np_copy[ind_np[s]] = val_np
+
+        assert_array_equal(x_np_copy, dpt.asnumpy(x_copy))
+
+
+@pytest.mark.parametrize(
+    "ind_dt",
+    _all_int_dtypes,
+)
+@pytest.mark.parametrize("order", ["C", "F"])
+def test_put_strided_indices(ind_dt, order):
+    q = get_queue_or_skip()
+
+    x = dpt.arange(27, dtype="i4", sycl_queue=q)
+    ind = dpt.reshape(
+        dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q), (4, 3), order=order
+    )
+    val = dpt.asarray(-1, sycl_queue=q, dtype=x.dtype)
+
+    x_np = dpt.asnumpy(x)
+    ind_np = dpt.asnumpy(ind).astype("i8")
+    val_np = dpt.asnumpy(val)
+
+    for s in (
+        slice(None, None, 2),
+        slice(None, None, -2),
+    ):
+        for sgn in [-1, 1]:
+            inds = ind[s, ::sgn]
+            inds_np = ind_np[s, ::sgn]
+
+            x_copy = dpt.copy(x)
+            x_copy[inds] = val
+
+            x_np_copy = x_np.copy()
+            x_np_copy[inds_np] = val_np
+
+            assert_array_equal(x_np_copy, dpt.asnumpy(x_copy))
+
+
+def test_integer_indexing_modes():
+    q = get_queue_or_skip()
+
+    x = dpt.arange(5, sycl_queue=q)
+    x_np = dpt.asnumpy(x)
+
+    # wrapping negative indices
+    ind = dpt.asarray([-4, -3, 0, 2, 4], dtype="i8", sycl_queue=q)
+
+    res = dpt.take(x, ind, mode="wrap")
+    expected_arr = np.take(x_np, dpt.asnumpy(ind), mode="raise")
+
+    assert (dpt.asnumpy(res) == expected_arr).all()
+
+    # clipping to 0 (disabling negative indices)
+    ind = dpt.asarray([-6, -3, 0, 2, 6], dtype="i8", sycl_queue=q)
+
+    res = dpt.take(x, ind, mode="clip")
+    expected_arr = np.take(x_np, dpt.asnumpy(ind), mode="clip")
+
+    assert (dpt.asnumpy(res) == expected_arr).all()
+
+
+def test_take_arg_validation():
+    q = get_queue_or_skip()
+
+    x = dpt.arange(4, dtype="i4", sycl_queue=q)
+    ind0 = dpt.arange(4, dtype="i8", sycl_queue=q)
+    ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q)
+
+    with pytest.raises(TypeError):
+        dpt.take(dict(), ind0, axis=0)
+    with pytest.raises(TypeError):
+        dpt.take(x, dict(), axis=0)
+    with pytest.raises(IndexError):
+        x[[]]
+    with pytest.raises(IndexError):
+        dpt.take(x, ind1, axis=0)
+    with pytest.raises(IndexError):
+        x[ind1]
+
+    with pytest.raises(ValueError):
+        dpt.take(dpt.reshape(x, (2, 2)), ind0)
+    with pytest.raises(ValueError):
+        dpt.take(x, ind0, mode=0)
+    with pytest.raises(ValueError):
+        dpt.take(dpt.reshape(x, (2, 2)), ind0, axis=None)
+    with pytest.raises(ValueError):
+        dpt.take(x, dpt.reshape(ind0, (2, 2)))
+    with pytest.raises(ValueError):
+        dpt.take(x[0], ind0, axis=2)
+    with pytest.raises(ValueError):
+        dpt.take(x[:, dpt.newaxis, dpt.newaxis], ind0, axis=None)
+
+
+def test_put_arg_validation():
+    q = get_queue_or_skip()
+
+    x = dpt.arange(4, dtype="i4", sycl_queue=q)
+    ind0 = dpt.arange(4, dtype="i8", sycl_queue=q)
+    ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q)
+    val = dpt.asarray(2, dtype=x.dtype, sycl_queue=q)
+
+    with pytest.raises(TypeError):
+        dpt.put(dict(), ind0, val, axis=0)
+    with pytest.raises(TypeError):
+        dpt.put(x, dict(), val, axis=0)
+    with pytest.raises(IndexError):
+        x[[]] = val
+    with pytest.raises(IndexError):
+        dpt.put(x, ind1, val, axis=0)
+    with pytest.raises(IndexError):
+        x[ind1] = val
+    with pytest.raises(TypeError):
+        dpt.put(x, ind0, {}, axis=0)
+    with pytest.raises(TypeError):
+        x[ind0] = {}
+
+    with pytest.raises(ValueError):
+        dpt.put(x, ind0, val, mode=0)
+    with pytest.raises(ValueError):
+        dpt.put(x, dpt.reshape(ind0, (2, 2)), val)
+    with pytest.raises(ValueError):
+        dpt.put(x[0], ind0, val, axis=2)
+    with pytest.raises(ValueError):
+        dpt.put(x[:, dpt.newaxis, dpt.newaxis], ind0, val, axis=None)
+
+
+def test_advanced_indexing_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    x = dpt.arange(4, sycl_queue=q1)
+    ind0 = dpt.asarray([0], sycl_queue=q1)
+    ind1 = dpt.asarray([0], sycl_queue=q2)
+    val0 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q1)
+    val1 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q2)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.take(x, ind1, axis=0)
+    with pytest.raises(ExecutionPlacementError):
+        x[ind1]
+    with pytest.raises(ExecutionPlacementError):
+        dpt.put(x, ind1, val0, axis=0)
+    with pytest.raises(ExecutionPlacementError):
+        x[ind1] = val0
+    with pytest.raises(ExecutionPlacementError):
+        dpt.put(x, ind0, val1, axis=0)
+    with pytest.raises(ExecutionPlacementError):
+        x[ind0] = val1
+
+
+def test_extract_all_1d():
+    get_queue_or_skip()
+    x = dpt.arange(30, dtype="i4")
+    sel = dpt.ones(30, dtype="?")
+    sel[::2] = False
+
+    res = x[sel]
+    expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)]
+    assert (dpt.asnumpy(res) == expected_res).all()
+
+    res2 = dpt.extract(sel, x)
+    assert (dpt.asnumpy(res2) == expected_res).all()
+
+    # test strided case
+    x = dpt.arange(15, dtype="i4")
+    sel_np = np.zeros(15, dtype="?")
+    np.put(sel_np, np.random.choice(sel_np.size, size=7), True)
+    sel = dpt.asarray(sel_np)
+
+    res = x[sel[::-1]]
+    expected_res = dpt.asnumpy(x)[sel_np[::-1]]
+    assert (dpt.asnumpy(res) == expected_res).all()
+
+    res2 = dpt.extract(sel[::-1], x)
+    assert (dpt.asnumpy(res2) == expected_res).all()
+
+
+def test_extract_all_2d():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6))
+    sel = dpt.ones(30, dtype="?")
+    sel[::2] = False
+    sel = dpt.reshape(sel, x.shape)
+
+    res = x[sel]
+    expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)]
+    assert (dpt.asnumpy(res) == expected_res).all()
+
+    res2 = dpt.extract(sel, x)
+    assert (dpt.asnumpy(res2) == expected_res).all()
+
+
+def test_extract_2D_axis0():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6))
+    sel = dpt.ones(x.shape[0], dtype="?")
+    sel[::2] = False
+
+    res = x[sel]
+    expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)]
+    assert (dpt.asnumpy(res) == expected_res).all()
+
+
+def test_extract_2D_axis1():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6))
+    sel = dpt.ones(x.shape[1], dtype="?")
+    sel[::2] = False
+
+    res = x[:, sel]
+    expected = dpt.asnumpy(x)[:, dpt.asnumpy(sel)]
+    assert (dpt.asnumpy(res) == expected).all()
+
+
+def test_extract_begin():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
+    y = dpt.permute_dims(x, (2, 0, 3, 1))
+    sel = dpt.zeros((3, 3), dtype="?")
+    sel[0, 0] = True
+    sel[1, 1] = True
+    z = y[sel]
+    expected = dpt.asnumpy(y)[[0, 1], [0, 1]]
+    assert (dpt.asnumpy(z) == expected).all()
+
+
+def test_extract_end():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
+    y = dpt.permute_dims(x, (2, 0, 3, 1))
+    sel = dpt.zeros((4, 4), dtype="?")
+    sel[0, 0] = True
+    z = y[..., sel]
+    expected = dpt.asnumpy(y)[..., [0], [0]]
+    assert (dpt.asnumpy(z) == expected).all()
+
+
+def test_extract_middle():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
+    y = dpt.permute_dims(x, (2, 0, 3, 1))
+    sel = dpt.zeros((3, 4), dtype="?")
+    sel[0, 0] = True
+    z = y[:, sel]
+    expected = dpt.asnumpy(y)[:, [0], [0], :]
+    assert (dpt.asnumpy(z) == expected).all()
+
+
+def test_extract_empty_result():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
+    y = dpt.permute_dims(x, (2, 0, 3, 1))
+    sel = dpt.zeros((3, 4), dtype="?")
+    z = y[:, sel]
+    assert z.shape == (
+        y.shape[0],
+        0,
+        y.shape[3],
+    )
+
+
+def test_place_all_1d():
+    get_queue_or_skip()
+    x = dpt.arange(10, dtype="i2")
+    sel = dpt.zeros(10, dtype="?")
+    sel[0::2] = True
+    val = dpt.zeros(5, dtype=x.dtype)
+    x[sel] = val
+    assert (dpt.asnumpy(x) == np.array([0, 1, 0, 3, 0, 5, 0, 7, 0, 9])).all()
+    dpt.place(x, sel, dpt.asarray([2]))
+    assert (dpt.asnumpy(x) == np.array([2, 1, 2, 3, 2, 5, 2, 7, 2, 9])).all()
+
+
+def test_place_2d_axis0():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4))
+    sel = dpt.asarray([True, False, True])
+    val = dpt.zeros((2, 4), dtype=x.dtype)
+    x[sel] = val
+    expected_x = np.stack(
+        (
+            np.zeros(4, dtype="i2"),
+            np.arange(4, 8, dtype="i2"),
+            np.zeros(4, dtype="i2"),
+        )
+    )
+    assert (dpt.asnumpy(x) == expected_x).all()
+
+
+def test_place_2d_axis1():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4))
+    sel = dpt.asarray([True, False, True, False])
+    val = dpt.zeros((3, 2), dtype=x.dtype)
+    x[:, sel] = val
+    expected_x = np.array(
+        [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2"
+    )
+    assert (dpt.asnumpy(x) == expected_x).all()
+
+
+def test_place_2d_axis1_scalar():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4))
+    sel = dpt.asarray([True, False, True, False])
+    val = dpt.zeros(tuple(), dtype=x.dtype)
+    x[:, sel] = val
+    expected_x = np.array(
+        [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2"
+    )
+    assert (dpt.asnumpy(x) == expected_x).all()
+
+
+def test_place_all_slices():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4))
+    sel = dpt.asarray(
+        [
+            [False, True, True, False],
+            [True, True, False, False],
+            [False, False, True, True],
+        ],
+        dtype="?",
+    )
+    y = dpt.ones_like(x)
+    y[sel] = x[sel]
+
+
+def test_place_some_slices_begin():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
+    y = dpt.permute_dims(x, (2, 0, 3, 1))
+    sel = dpt.zeros((3, 3), dtype="?")
+    sel[0, 0] = True
+    sel[1, 1] = True
+    z = y[sel]
+    w = dpt.zeros_like(y)
+    w[sel] = z
+
+
+def test_place_some_slices_mid():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
+    y = dpt.permute_dims(x, (2, 0, 3, 1))
+    sel = dpt.zeros((3, 4), dtype="?")
+    sel[0, 0] = True
+    sel[1, 1] = True
+    z = y[:, sel]
+    w = dpt.zeros_like(y)
+    w[:, sel] = z
+
+
+def test_place_some_slices_end():
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
+    y = dpt.permute_dims(x, (2, 0, 3, 1))
+    sel = dpt.zeros((4, 4), dtype="?")
+    sel[0, 0] = True
+    sel[1, 1] = True
+    z = y[:, :, sel]
+    w = dpt.zeros_like(y)
+    w[:, :, sel] = z
+
+
+def test_place_cycling():
+    get_queue_or_skip()
+    x = dpt.zeros(10, dtype="f4")
+    y = dpt.asarray([2, 3])
+    sel = dpt.ones(x.size, dtype="?")
+    dpt.place(x, sel, y)
+    expected = np.array(
+        [
+            2,
+            3,
+        ]
+        * 5,
+        dtype=x.dtype,
+    )
+    assert (dpt.asnumpy(x) == expected).all()
+
+
+def test_place_subset():
+    get_queue_or_skip()
+    x = dpt.zeros(10, dtype="f4")
+    y = dpt.ones_like(x)
+    sel = dpt.ones(x.size, dtype="?")
+    sel[::2] = False
+    dpt.place(x, sel, y)
+    expected = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1], dtype=x.dtype)
+    assert (dpt.asnumpy(x) == expected).all()
+
+
+def test_place_empty_vals_error():
+    get_queue_or_skip()
+    x = dpt.zeros(10, dtype="f4")
+    y = dpt.empty((0,), dtype=x.dtype)
+    sel = dpt.ones(x.size, dtype="?")
+    sel[::2] = False
+    with pytest.raises(ValueError):
+        dpt.place(x, sel, y)
+
+
+def test_place_empty_vals_full_false_mask():
+    get_queue_or_skip()
+    x = dpt.ones(10, dtype="f4")
+    y = dpt.empty((0,), dtype=x.dtype)
+    sel = dpt.zeros(x.size, dtype="?")
+    expected = np.ones(10, dtype=x.dtype)
+    dpt.place(x, sel, y)
+    assert (dpt.asnumpy(x) == expected).all()
+
+
+def test_nonzero():
+    get_queue_or_skip()
+    x = dpt.concat((dpt.zeros(3), dpt.ones(4), dpt.zeros(3)))
+    (i,) = dpt.nonzero(x)
+    assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all()
+
+
+def test_nonzero_f_contig():
+    "See gh-1370"
+    get_queue_or_skip()
+
+    mask = dpt.zeros((5, 5), dtype="?", order="F")
+    mask[2, 3] = True
+
+    expected_res = np.nonzero(dpt.asnumpy(mask))
+    result = dpt.nonzero(mask)
+
+    for exp, res in zip(expected_res, result):
+        assert_array_equal(dpt.asnumpy(res), exp)
+    assert dpt.asnumpy(mask[result]).all()
+
+
+def test_nonzero_compacting():
+    """See gh-1370.
+    Test with input where dimensionality
+    of iteration space is compacted from 3d to 2d
+    """
+    get_queue_or_skip()
+
+    mask = dpt.zeros((5, 5, 5), dtype="?", order="F")
+    mask[3, 2, 1] = True
+    mask_view = mask[..., :3]
+
+    expected_res = np.nonzero(dpt.asnumpy(mask_view))
+    result = dpt.nonzero(mask_view)
+
+    for exp, res in zip(expected_res, result):
+        assert_array_equal(dpt.asnumpy(res), exp)
+    assert dpt.asnumpy(mask_view[result]).all()
+
+
+def test_assign_scalar():
+    get_queue_or_skip()
+    x = dpt.arange(-5, 5, dtype="i8")
+    cond = dpt.asarray(
+        [True, True, True, True, True, False, False, False, False, False]
+    )
+    x[cond] = 0  # no error expected
+    x[dpt.nonzero(cond)] = -1
+    expected = np.array([-1, -1, -1, -1, -1, 0, 1, 2, 3, 4], dtype=x.dtype)
+    assert (dpt.asnumpy(x) == expected).all()
+
+
+def test_nonzero_large():
+    get_queue_or_skip()
+    m = dpt.full((60, 80), True)
+    assert m[m].size == m.size
+
+    m = dpt.full((30, 60, 80), True)
+    assert m[m].size == m.size
+
+
+def test_extract_arg_validation():
+    get_queue_or_skip()
+    with pytest.raises(TypeError):
+        dpt.extract(None, None)
+    cond = dpt.ones(10, dtype="?")
+    with pytest.raises(TypeError):
+        dpt.extract(cond, None)
+    q1 = dpctl.SyclQueue()
+    with pytest.raises(ExecutionPlacementError):
+        dpt.extract(cond.to_device(q1), dpt.zeros_like(cond, dtype="u1"))
+    with pytest.raises(ValueError):
+        dpt.extract(dpt.ones((2, 3), dtype="?"), dpt.ones((3, 2), dtype="i1"))
+
+
+def test_place_arg_validation():
+    get_queue_or_skip()
+    with pytest.raises(TypeError):
+        dpt.place(None, None, None)
+    arr = dpt.zeros(8, dtype="i1")
+    with pytest.raises(TypeError):
+        dpt.place(arr, None, None)
+    cond = dpt.ones(8, dtype="?")
+    with pytest.raises(TypeError):
+        dpt.place(arr, cond, None)
+    vals = dpt.ones_like(arr)
+    q1 = dpctl.SyclQueue()
+    with pytest.raises(ExecutionPlacementError):
+        dpt.place(arr.to_device(q1), cond, vals)
+    with pytest.raises(ValueError):
+        dpt.place(dpt.reshape(arr, (2, 2, 2)), cond, vals)
+
+
+def test_nonzero_arg_validation():
+    get_queue_or_skip()
+    with pytest.raises(TypeError):
+        dpt.nonzero(list())
+    with pytest.raises(ValueError):
+        dpt.nonzero(dpt.asarray(1))
+
+
+def test_nonzero_dtype():
+    "See gh-1322"
+    get_queue_or_skip()
+    x = dpt.ones((3, 4))
+    idx, idy = dpt.nonzero(x)
+    # create array using device's
+    # default index data type
+    index_dt = dpt.dtype(ti.default_device_index_type(x.sycl_queue))
+    assert idx.dtype == index_dt
+    assert idy.dtype == index_dt
+
+
+def test_take_empty_axes():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 0, 4, 5, 6), dtype="f4")
+    inds = dpt.ones(1, dtype="i4")
+
+    with pytest.raises(IndexError):
+        dpt.take(x, inds, axis=1)
+
+    inds = dpt.ones(0, dtype="i4")
+    r = dpt.take(x, inds, axis=1)
+    assert r.shape == x.shape
+
+
+def test_put_empty_axes():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 0, 4, 5, 6), dtype="f4")
+    inds = dpt.ones(1, dtype="i4")
+    vals = dpt.zeros((3, 1, 4, 5, 6), dtype="f4")
+
+    with pytest.raises(IndexError):
+        dpt.put(x, inds, vals, axis=1)
+
+    inds = dpt.ones(0, dtype="i4")
+    vals = dpt.zeros_like(x)
+
+    with pytest.raises(ValueError):
+        dpt.put(x, inds, vals, axis=1)
+
+
+def test_put_cast_vals():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    inds = dpt.arange(7, 10, dtype="i4")
+    vals = dpt.zeros_like(inds, dtype="f4")
+
+    dpt.put(x, inds, vals)
+    assert dpt.all(x[7:10] == 0)
+
+
+def test_advanced_integer_indexing_cast_vals():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    inds = dpt.arange(7, 10, dtype="i4")
+    vals = dpt.zeros_like(inds, dtype="f4")
+
+    x[inds] = vals
+    assert dpt.all(x[7:10] == 0)
+
+
+def test_advanced_integer_indexing_empty_axis():
+    get_queue_or_skip()
+
+    # getting
+    x = dpt.ones((3, 0, 4, 5, 6), dtype="f4")
+    inds = dpt.ones(1, dtype="i4")
+    with pytest.raises(IndexError):
+        x[:, inds, ...]
+    with pytest.raises(IndexError):
+        x[inds, inds, inds, ...]
+
+    # setting
+    with pytest.raises(IndexError):
+        x[:, inds, ...] = 2
+    with pytest.raises(IndexError):
+        x[inds, inds, inds, ...] = 2
+
+    # empty inds
+    inds = dpt.ones(0, dtype="i4")
+    assert x[:, inds, ...].shape == x.shape
+    assert x[inds, inds, inds, ...].shape == (0, 5, 6)
+
+    vals = dpt.zeros_like(x)
+    x[:, inds, ...] = vals
+    vals = dpt.zeros((0, 5, 6), dtype="f4")
+    x[inds, inds, inds, ...] = vals
+
+
+def test_advanced_integer_indexing_cast_indices():
+    get_queue_or_skip()
+
+    inds0 = dpt.asarray([0, 1], dtype="i1")
+    for ind_dts in (("i1", "i2", "i4"), ("i1", "u4", "i4"), ("u1", "u2", "u8")):
+        x = dpt.ones((3, 4, 5, 6), dtype="i4")
+        inds0 = dpt.asarray([0, 1], dtype=ind_dts[0])
+        inds1 = dpt.astype(inds0, ind_dts[1])
+        x[inds0, inds1, ...] = 2
+        assert dpt.all(x[inds0, inds1, ...] == 2)
+        inds2 = dpt.astype(inds0, ind_dts[2])
+        x[inds0, inds1, ...] = 2
+        assert dpt.all(x[inds0, inds1, inds2, ...] == 2)
+
+    # fail when float would be required per type promotion
+    inds0 = dpt.asarray([0, 1], dtype="i1")
+    inds1 = dpt.astype(inds0, "u4")
+    inds2 = dpt.astype(inds0, "u8")
+    x = dpt.ones((3, 4, 5, 6), dtype="i4")
+    # test getitem
+    with pytest.raises(ValueError):
+        x[inds0, inds1, inds2, ...]
+    # test setitem
+    with pytest.raises(ValueError):
+        x[inds0, inds1, inds2, ...] = 1
+
+
+def test_take_along_axis():
+    get_queue_or_skip()
+
+    n0, n1, n2 = 3, 5, 7
+    x = dpt.reshape(dpt.arange(n0 * n1 * n2), (n0, n1, n2))
+    ind_dt = dpt.__array_namespace_info__().default_dtypes(
+        device=x.sycl_device
+    )["indexing"]
+    ind0 = dpt.ones((1, n1, n2), dtype=ind_dt)
+    ind1 = dpt.ones((n0, 1, n2), dtype=ind_dt)
+    ind2 = dpt.ones((n0, n1, 1), dtype=ind_dt)
+
+    y0 = dpt.take_along_axis(x, ind0, axis=0)
+    assert y0.shape == ind0.shape
+    y1 = dpt.take_along_axis(x, ind1, axis=1)
+    assert y1.shape == ind1.shape
+    y2 = dpt.take_along_axis(x, ind2, axis=2)
+    assert y2.shape == ind2.shape
+
+
+def test_take_along_axis_validation():
+    # validate first argument
+    with pytest.raises(TypeError):
+        dpt.take_along_axis(tuple(), list())
+    get_queue_or_skip()
+    n1, n2 = 2, 5
+    x = dpt.ones(n1 * n2)
+    # validate second argument
+    with pytest.raises(TypeError):
+        dpt.take_along_axis(x, list())
+    x_dev = x.sycl_device
+    info_ = dpt.__array_namespace_info__()
+    def_dtypes = info_.default_dtypes(device=x_dev)
+    ind_dt = def_dtypes["indexing"]
+    ind = dpt.zeros(1, dtype=ind_dt)
+    # axis validation
+    with pytest.raises(ValueError):
+        dpt.take_along_axis(x, ind, axis=1)
+    # mode validation
+    with pytest.raises(ValueError):
+        dpt.take_along_axis(x, ind, axis=0, mode="invalid")
+    # same array-ranks validation
+    with pytest.raises(ValueError):
+        dpt.take_along_axis(dpt.reshape(x, (n1, n2)), ind)
+    # check compute-follows-data
+    q2 = dpctl.SyclQueue(x_dev, property="enable_profiling")
+    ind2 = dpt.zeros(1, dtype=ind_dt, sycl_queue=q2)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.take_along_axis(x, ind2)
+
+
+def test_put_along_axis():
+    get_queue_or_skip()
+
+    n0, n1, n2 = 3, 5, 7
+    x = dpt.reshape(dpt.arange(n0 * n1 * n2), (n0, n1, n2))
+    ind_dt = dpt.__array_namespace_info__().default_dtypes(
+        device=x.sycl_device
+    )["indexing"]
+    ind0 = dpt.ones((1, n1, n2), dtype=ind_dt)
+    ind1 = dpt.ones((n0, 1, n2), dtype=ind_dt)
+    ind2 = dpt.ones((n0, n1, 1), dtype=ind_dt)
+
+    xc = dpt.copy(x)
+    vals = dpt.ones(ind0.shape, dtype=x.dtype)
+    dpt.put_along_axis(xc, ind0, vals, axis=0)
+    assert dpt.all(dpt.take_along_axis(xc, ind0, axis=0) == vals)
+
+    xc = dpt.copy(x)
+    vals = dpt.ones(ind1.shape, dtype=x.dtype)
+    dpt.put_along_axis(xc, ind1, vals, axis=1)
+    assert dpt.all(dpt.take_along_axis(xc, ind1, axis=1) == vals)
+
+    xc = dpt.copy(x)
+    vals = dpt.ones(ind2.shape, dtype=x.dtype)
+    dpt.put_along_axis(xc, ind2, vals, axis=2)
+    assert dpt.all(dpt.take_along_axis(xc, ind2, axis=2) == vals)
+
+    xc = dpt.copy(x)
+    vals = dpt.ones(ind2.shape, dtype=x.dtype)
+    dpt.put_along_axis(xc, ind2, dpt.asnumpy(vals), axis=2)
+    assert dpt.all(dpt.take_along_axis(xc, ind2, axis=2) == vals)
+
+
+def test_put_along_axis_validation():
+    # validate first argument
+    with pytest.raises(TypeError):
+        dpt.put_along_axis(tuple(), list(), list())
+    get_queue_or_skip()
+    n1, n2 = 2, 5
+    x = dpt.ones(n1 * n2)
+    # validate second argument
+    with pytest.raises(TypeError):
+        dpt.put_along_axis(x, list(), list())
+    x_dev = x.sycl_device
+    info_ = dpt.__array_namespace_info__()
+    def_dtypes = info_.default_dtypes(device=x_dev)
+    ind_dt = def_dtypes["indexing"]
+    ind = dpt.zeros(1, dtype=ind_dt)
+    vals = dpt.zeros(1, dtype=x.dtype)
+    # axis validation
+    with pytest.raises(ValueError):
+        dpt.put_along_axis(x, ind, vals, axis=1)
+    # mode validation
+    with pytest.raises(ValueError):
+        dpt.put_along_axis(x, ind, vals, axis=0, mode="invalid")
+    # same array-ranks validation
+    with pytest.raises(ValueError):
+        dpt.put_along_axis(dpt.reshape(x, (n1, n2)), ind, vals)
+    # check compute-follows-data
+    q2 = dpctl.SyclQueue(x_dev, property="enable_profiling")
+    ind2 = dpt.zeros(1, dtype=ind_dt, sycl_queue=q2)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.put_along_axis(x, ind2, vals)
+
+
+def test_put_along_axis_application():
+    get_queue_or_skip()
+    info_ = dpt.__array_namespace_info__()
+    def_dtypes = info_.default_dtypes(device=None)
+    ind_dt = def_dtypes["indexing"]
+    all_perms = dpt.asarray(
+        [
+            [0, 1, 2, 3],
+            [0, 2, 1, 3],
+            [2, 0, 1, 3],
+            [2, 1, 0, 3],
+            [1, 0, 2, 3],
+            [1, 2, 0, 3],
+            [0, 1, 3, 2],
+            [0, 2, 3, 1],
+            [2, 0, 3, 1],
+            [2, 1, 3, 0],
+            [1, 0, 3, 2],
+            [1, 2, 3, 0],
+            [0, 3, 1, 2],
+            [0, 3, 2, 1],
+            [2, 3, 0, 1],
+            [2, 3, 1, 0],
+            [1, 3, 0, 2],
+            [1, 3, 2, 0],
+            [3, 0, 1, 2],
+            [3, 0, 2, 1],
+            [3, 2, 0, 1],
+            [3, 2, 1, 0],
+            [3, 1, 0, 2],
+            [3, 1, 2, 0],
+        ],
+        dtype=ind_dt,
+    )
+    p_mats = dpt.zeros((24, 4, 4), dtype=dpt.int64)
+    vals = dpt.ones((24, 4, 1), dtype=p_mats.dtype)
+    # form 24 permutation matrices
+    dpt.put_along_axis(p_mats, all_perms[..., dpt.newaxis], vals, axis=2)
+    p2 = p_mats @ p_mats
+    p4 = p2 @ p2
+    p8 = p4 @ p4
+    expected = dpt.eye(4, dtype=p_mats.dtype)[dpt.newaxis, ...]
+    assert dpt.all(p8 @ p4 == expected)
+
+
+def check__extract_impl_validation(fn):
+    x = dpt.ones(10)
+    ind = dpt.ones(10, dtype="?")
+    with pytest.raises(TypeError):
+        fn(list(), ind)
+    with pytest.raises(TypeError):
+        fn(x, list())
+    q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling")
+    ind2 = dpt.ones(10, dtype="?", sycl_queue=q2)
+    with pytest.raises(ExecutionPlacementError):
+        fn(x, ind2)
+    with pytest.raises(ValueError):
+        fn(x, ind, 1)
+
+
+def check__nonzero_impl_validation(fn):
+    with pytest.raises(TypeError):
+        fn(list())
+
+
+def check__take_multi_index(fn):
+    x = dpt.ones(10)
+    x_dev = x.sycl_device
+    info_ = dpt.__array_namespace_info__()
+    def_dtypes = info_.default_dtypes(device=x_dev)
+    ind_dt = def_dtypes["indexing"]
+    ind = dpt.arange(10, dtype=ind_dt)
+    with pytest.raises(TypeError):
+        fn(list(), tuple(), 1)
+    with pytest.raises(ValueError):
+        fn(x, (ind,), 0, mode=2)
+    with pytest.raises(ValueError):
+        fn(x, (None,), 1)
+    with pytest.raises(IndexError):
+        fn(x, (x,), 1)
+    q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling")
+    ind2 = dpt.arange(10, dtype=ind_dt, sycl_queue=q2)
+    with pytest.raises(ExecutionPlacementError):
+        fn(x, (ind2,), 0)
+    m = dpt.ones((10, 10))
+    ind_1 = dpt.arange(10, dtype="i8")
+    ind_2 = dpt.arange(10, dtype="u8")
+    with pytest.raises(ValueError):
+        fn(m, (ind_1, ind_2), 0)
+
+
+def check__place_impl_validation(fn):
+    with pytest.raises(TypeError):
+        fn(list(), list(), list())
+    x = dpt.ones(10)
+    with pytest.raises(TypeError):
+        fn(x, list(), list())
+    q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling")
+    mask2 = dpt.ones(10, dtype="?", sycl_queue=q2)
+    with pytest.raises(ExecutionPlacementError):
+        fn(x, mask2, 1)
+    x2 = dpt.ones((5, 5))
+    mask2 = dpt.ones((5, 5), dtype="?")
+    with pytest.raises(ValueError):
+        fn(x2, mask2, x2, axis=1)
+
+
+def check__put_multi_index_validation(fn):
+    with pytest.raises(TypeError):
+        fn(list(), list(), 0, list())
+    x = dpt.ones(10)
+    inds = dpt.arange(10, dtype="i8")
+    vals = dpt.zeros(10)
+    # test inds which is not a tuple/list
+    fn(x, inds, 0, vals)
+    x2 = dpt.ones((5, 5))
+    ind1 = dpt.arange(5, dtype="i8")
+    ind2 = dpt.arange(5, dtype="u8")
+    with pytest.raises(ValueError):
+        fn(x2, (ind1, ind2), 0, x2)
+    with pytest.raises(TypeError):
+        # invalid index type
+        fn(x2, (ind1, list()), 0, x2)
+    with pytest.raises(ValueError):
+        # invalid mode keyword value
+        fn(x, inds, 0, vals, mode=100)
+
+
+def test__copy_utils():
+    import dpnp.tensor._copy_utils as cu
+
+    get_queue_or_skip()
+
+    check__extract_impl_validation(cu._extract_impl)
+    check__nonzero_impl_validation(cu._nonzero_impl)
+    check__take_multi_index(cu._take_multi_index)
+    check__place_impl_validation(cu._place_impl)
+    check__put_multi_index_validation(cu._put_multi_index)
+
+
+@pytest.mark.parametrize("mode", ["wrap", "clip"])
+def test_take_indices_oob_py_ssize_t(mode):
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    inds1 = dpt.full(5, dpt.iinfo(dpt.uint64).max, dtype=dpt.uint64)
+    inds2 = dpt.full(5, dpt.iinfo(dpt.uint64).max, dtype=dpt.uint64)
+
+    # sweep through a small range of indices
+    # to check that OOB indices are well-behaved
+    for i in range(1, 10):
+        inds2 -= i
+        r1 = dpt.take(x, inds1, mode=mode)
+        r2 = dpt.take(x, inds2, mode=mode)
+
+        assert dpt.all(r1 == r2)
+
+
+@pytest.mark.parametrize("mode", ["wrap", "clip"])
+def test_put_indices_oob_py_ssize_t(mode):
+    get_queue_or_skip()
+
+    x = dpt.full(10, -1, dtype="i4")
+    inds = dpt.full(1, dpt.iinfo(dpt.uint64).max, dtype=dpt.uint64)
+
+    # OOB inds are positive, so always
+    # clip to the top of range
+    for i in range(1, 10):
+        inds -= i
+        dpt.put(x, inds, i, mode=mode)
+
+        assert dpt.all(x[:-1] == -1)
+        assert x[-1] == i
+
+
+def test_take_along_axis_uint64_indices():
+    get_queue_or_skip()
+
+    inds = dpt.arange(1, 10, 2, dtype="u8")
+    x = dpt.tile(dpt.asarray([0, -1], dtype="i4"), 5)
+    res = dpt.take_along_axis(x, inds)
+    assert dpt.all(res == -1)
+
+    sh0 = 2
+    inds = dpt.broadcast_to(inds, (sh0,) + inds.shape)
+    x = dpt.broadcast_to(x, (sh0,) + x.shape)
+    res = dpt.take_along_axis(x, inds, axis=1)
+    assert dpt.all(res == -1)
+
+
+def test_put_along_axis_uint64_indices():
+    get_queue_or_skip()
+
+    inds = dpt.arange(1, 10, 2, dtype="u8")
+    x = dpt.zeros(10, dtype="i4")
+    dpt.put_along_axis(x, inds, dpt.asarray(2, dtype=x.dtype))
+    expected = dpt.tile(dpt.asarray([0, 2], dtype="i4"), 5)
+    assert dpt.all(x == expected)
+
+    sh0 = 2
+    inds = dpt.broadcast_to(inds, (sh0,) + inds.shape)
+    x = dpt.zeros((sh0,) + x.shape, dtype="i4")
+    dpt.put_along_axis(x, inds, dpt.asarray(2, dtype=x.dtype), axis=1)
+    expected = dpt.tile(dpt.asarray([0, 2], dtype="i4"), (2, 5))
+    assert dpt.all(expected == x)
+
+
+@pytest.mark.parametrize("data_dt", _all_dtypes)
+@pytest.mark.parametrize("order", ["C", "F"])
+def test_take_out(data_dt, order):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(data_dt, q)
+
+    axis = 0
+    x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order)
+    ind = dpt.arange(2, dtype="i8", sycl_queue=q)
+    out_sh = x.shape[:axis] + ind.shape + x.shape[axis + 1 :]
+    out = dpt.empty(out_sh, dtype=data_dt, sycl_queue=q)
+
+    expected = dpt.take(x, ind, axis=axis)
+
+    dpt.take(x, ind, axis=axis, out=out)
+
+    assert dpt.all(out == expected)
+
+
+@pytest.mark.parametrize("data_dt", _all_dtypes)
+@pytest.mark.parametrize("order", ["C", "F"])
+def test_take_out_overlap(data_dt, order):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(data_dt, q)
+
+    axis = 0
+    x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order)
+    ind = dpt.arange(2, dtype="i8", sycl_queue=q)
+    out = x[x.shape[axis] - ind.shape[axis] : x.shape[axis], :]
+
+    expected = dpt.take(x, ind, axis=axis)
+
+    dpt.take(x, ind, axis=axis, out=out)
+
+    assert dpt.all(out == expected)
+    assert dpt.all(x[x.shape[0] - ind.shape[0] : x.shape[0], :] == out)
+
+
+def test_take_out_errors():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4", sycl_queue=q1)
+    ind = dpt.arange(2, dtype="i4", sycl_queue=q1)
+
+    with pytest.raises(TypeError):
+        dpt.take(x, ind, out=dict())
+
+    out_read_only = dpt.empty(ind.shape, dtype=x.dtype, sycl_queue=q1)
+    out_read_only.flags["W"] = False
+    with pytest.raises(ValueError):
+        dpt.take(x, ind, out=out_read_only)
+
+    out_bad_shape = dpt.empty(0, dtype=x.dtype, sycl_queue=q1)
+    with pytest.raises(ValueError):
+        dpt.take(x, ind, out=out_bad_shape)
+
+    out_bad_dt = dpt.empty(ind.shape, dtype="i8", sycl_queue=q1)
+    with pytest.raises(ValueError):
+        dpt.take(x, ind, out=out_bad_dt)
+
+    out_bad_q = dpt.empty(ind.shape, dtype=x.dtype, sycl_queue=q2)
+    with pytest.raises(dpctl.utils.ExecutionPlacementError):
+        dpt.take(x, ind, out=out_bad_q)
+
+
+def test_getitem_impl_fn_invalid_inp():
+    get_queue_or_skip()
+
+    x = dpt.ones((10, 10), dtype="i4")
+
+    bad_ind_type = (dpt.ones((), dtype="i4"), 2.0)
+    with pytest.raises(TypeError):
+        _take_multi_index(x, bad_ind_type, 0, 0)
+
+    no_array_inds = (2, 3)
+    with pytest.raises(TypeError):
+        _take_multi_index(x, no_array_inds, 0, 0)
diff --git a/dpnp/tests/tensor/test_usm_ndarray_linalg.py b/dpnp/tests/tensor/test_usm_ndarray_linalg.py
new file mode 100644
index 000000000000..13b03ff66fd5
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_linalg.py
@@ -0,0 +1,1031 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import dpctl
+import numpy as np
+import pytest
+from dpctl.utils import ExecutionPlacementError
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_numeric_types = [
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+def _map_int_to_type(n, dt):
+    assert isinstance(n, int)
+    assert n > 0
+    if dt == dpt.int8:
+        return ((n + 128) % 256) - 128
+    elif dt == dpt.uint8:
+        return n % 256
+    elif dt == dpt.int16:
+        return ((n + 32768) % 65536) - 32768
+    elif dt == dpt.uint16:
+        return n % 65536
+    return n
+
+
+def test_matrix_transpose():
+    get_queue_or_skip()
+
+    X = dpt.reshape(dpt.arange(2 * 3, dtype="i4"), (2, 3))
+    res = dpt.matrix_transpose(X)
+    expected_res = X.mT
+
+    assert expected_res.shape == res.shape
+    assert expected_res.flags["C"] == res.flags["C"]
+    assert expected_res.flags["F"] == res.flags["F"]
+    assert dpt.all(X.mT == res)
+
+
+def test_matrix_transpose_arg_validation():
+    get_queue_or_skip()
+
+    X = dpt.empty(5, dtype="i4")
+    with pytest.raises(ValueError):
+        dpt.matrix_transpose(X)
+
+    X = {}
+    with pytest.raises(TypeError):
+        dpt.matrix_transpose(X)
+
+    X = dpt.empty((5, 5), dtype="i4")
+    assert isinstance(dpt.matrix_transpose(X), dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_matmul_simple(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, m = 235, 17
+    m1 = dpt.zeros((m, n), dtype=dtype)
+    m2 = dpt.zeros((n, m), dtype=dtype)
+
+    dt = m1.dtype
+    if dt.kind in "ui":
+        n1 = min(n, dpt.iinfo(dt).max)
+    else:
+        n1 = n
+    m1[:, :n1] = dpt.ones((m, n1), dtype=dt)
+    m2[:n1, :] = dpt.ones((n1, m), dtype=dt)
+
+    for k in [1, 2, 3, 4, 7, 8, 9, 15, 16, 17]:
+        r = dpt.matmul(m1[:k, :], m2[:, :k])
+        assert dpt.all(r == dpt.full((k, k), fill_value=n1, dtype=dt))
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_matmul_nilpotent1(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n = 77
+    N_mat = dpt.eye(n, k=1, dtype=dtype)
+    I_mat = dpt.eye(n, dtype=dtype)
+    R_mat = dpt.eye(n, dtype=dtype)
+    for _ in range(n + 1):
+        R_mat = I_mat + dpt.matmul(N_mat, R_mat)
+
+    assert dpt.allclose(dpt.matmul(I_mat - N_mat, R_mat), I_mat)
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_matmul_nilpotent2(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n = 128
+    u = dpt.ones((n, 1), dtype=dtype)
+    v = dpt.ones((1, n), dtype=dtype)
+
+    uv = dpt.matmul(u, v)
+    uv_ref = u * v
+
+    assert dpt.allclose(uv, uv_ref)
+
+
+def test_matmul_null_axis():
+    get_queue_or_skip()
+    n = 3
+
+    A_mat = dpt.ones((n, 0), dtype="f4")
+    B_mat = dpt.ones((0, 1), dtype="f4")
+
+    R_mat = dpt.matmul(A_mat, B_mat)
+    assert R_mat.shape == (n, 1)
+
+    R_mat = dpt.matmul(A_mat, B_mat[:, :0])
+    assert R_mat.shape == (n, 0)
+
+
+@pytest.mark.parametrize("dtype", ["i4", "f4"])
+def test_matmul_dims(dtype):
+    get_queue_or_skip()
+
+    n, m, k, b = 4, 5, 7, 3
+    v = dpt.ones(k, dtype=dtype)
+    m1 = dpt.ones((n, k), dtype=dtype)
+    m2 = dpt.ones((k, m), dtype=dtype)
+    st1 = dpt.ones((b, n, k), dtype=dtype)
+    st2 = dpt.ones((b, k, m), dtype=dtype)
+
+    r = dpt.matmul(v, v)
+    assert r.shape == ()
+    assert dpt.round(r) == k
+
+    r = dpt.matmul(m1, v)
+    assert r.shape == (n,)
+    assert dpt.all(dpt.round(r) == k)
+
+    r = dpt.matmul(v, m2)
+    assert r.shape == (m,)
+    assert dpt.all(dpt.round(r) == k)
+
+    r = dpt.matmul(m1, m2)
+    assert r.shape == (
+        n,
+        m,
+    )
+    assert dpt.all(dpt.round(r) == k)
+
+    r = dpt.matmul(v, st2)
+    assert r.shape == (
+        b,
+        m,
+    )
+    assert dpt.all(dpt.round(r) == k)
+
+    r = dpt.matmul(st1, v)
+    assert r.shape == (
+        b,
+        n,
+    )
+    assert dpt.all(dpt.round(r) == k)
+
+    r = dpt.matmul(st1, m2)
+    assert r.shape == (
+        b,
+        n,
+        m,
+    )
+    assert dpt.all(dpt.round(r) == k)
+
+    r = dpt.matmul(m1, st2)
+    assert r.shape == (
+        b,
+        n,
+        m,
+    )
+    assert dpt.all(dpt.round(r) == k)
+
+    r = dpt.matmul(st1, st2)
+    assert r.shape == (
+        b,
+        n,
+        m,
+    )
+    assert dpt.all(dpt.round(r) == k)
+
+
+def test_matmul_arg_validation():
+    get_queue_or_skip()
+
+    s1, s2 = dpt.ones(tuple()), dpt.zeros(tuple())
+    v1, v2 = dpt.ones(16), dpt.zeros(16)
+
+    with pytest.raises(ValueError):
+        dpt.matmul(s1, v2)
+
+    with pytest.raises(ValueError):
+        dpt.matmul(v1, s2)
+
+    with pytest.raises(TypeError):
+        dpt.matmul(dict(), v2)
+
+    with pytest.raises(TypeError):
+        dpt.matmul(v2, None)
+
+
+def test_matmul_dims_validation():
+    get_queue_or_skip()
+
+    m1 = dpt.ones((16, 16))
+    m2 = dpt.ones((16, 16))
+
+    # contraction dimensions mismatch
+    with pytest.raises(ValueError):
+        dpt.matmul(m1[:, :7], m2[:3, :])
+
+    m1 = dpt.ones((3, 4, 5))
+    m2 = dpt.ones((2, 5, 3))
+    # broadcasting dimensions mismatch
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m2)
+
+
+def test_matmul_broadcasting():
+    get_queue_or_skip()
+
+    for dt1, dt2 in [
+        (dpt.int16, dpt.int32),
+        (dpt.float32, dpt.int16),
+        (dpt.int32, dpt.uint32),
+    ]:
+        m1 = dpt.ones((7, 11, 16), dtype=dt1)
+        m2 = dpt.ones((16, 13), dtype=dt2)
+
+        r = dpt.matmul(m1, m2[dpt.newaxis, ...])
+
+        assert r.shape == (7, 11, 13)
+
+
+@pytest.mark.parametrize("dtype", ["i4", "i8", "f4", "c8"])
+def test_matmul_strided(dtype):
+    get_queue_or_skip()
+
+    m1_shape = (14, 22, 32)
+    m1_size = 1
+    for el in m1_shape:
+        m1_size = m1_size * el
+
+    m1 = dpt.remainder(dpt.arange(1, m1_size + 1, dtype="i8"), 13)
+    m1_orig = dpt.reshape(dpt.astype(m1, dtype), m1_shape)
+    m2_orig = dpt.ones((14, 16, 13), dtype=dtype)
+
+    m1 = m1_orig[::2, ::-2, ::2]
+    m2 = m2_orig[::2, :, :]
+    r = dpt.matmul(m1, m2)
+
+    assert r.shape == m1.shape[:2] + m2.shape[-1:]
+    ref = np.matmul(dpt.asnumpy(m1), dpt.asnumpy(m2))
+    assert np.allclose(dpt.asnumpy(r), ref)
+
+    m1 = m1_orig[::2, ::2, ::-2]
+    m2 = m2_orig[::2, :, :]
+    r = dpt.matmul(m1, m2)
+
+    assert r.shape == m1.shape[:2] + m2.shape[-1:]
+    ref = np.matmul(dpt.asnumpy(m1), dpt.asnumpy(m2))
+    assert np.allclose(dpt.asnumpy(r), ref)
+
+    m1 = m1_orig[::-2, ::2, ::2]
+    m2 = m2_orig[::-2, :, :]
+    r = dpt.matmul(m1, m2)
+
+    assert r.shape == m1.shape[:2] + m2.shape[-1:]
+    ref = np.matmul(dpt.asnumpy(m1), dpt.asnumpy(m2))
+    assert np.allclose(dpt.asnumpy(r), ref)
+
+
+def test_matmul_out():
+    get_queue_or_skip()
+
+    m1 = (
+        dpt.arange(14, dtype="f4")[:, dpt.newaxis, dpt.newaxis]
+        + dpt.arange(17, dtype="f4")[dpt.newaxis, :, dpt.newaxis]
+        + dpt.arange(128, dtype="f4")[dpt.newaxis, dpt.newaxis, :]
+    )
+    assert m1.shape == (14, 17, 128)
+    m2 = dpt.tile(
+        dpt.reshape(dpt.asarray([1, 2], dtype="f4"), (2, 1, 1)), (7, 128, 13)
+    )
+    assert m2.shape == (14, 128, 13)
+
+    buf = dpt.zeros((2 * 14, 3 * 17, 13), dtype="f4")
+    res = dpt.matmul(m1, m2, out=buf[::-2, 1::3, :])
+
+    assert dpt.allclose(res, buf[::-2, 1::3, :])
+    assert dpt.allclose(dpt.zeros_like(res), buf[::-2, 0::3, :])
+    assert dpt.allclose(dpt.zeros_like(res), buf[::-2, 2::3, :])
+
+    m1_np = dpt.asnumpy(m1)
+    ref = np.matmul(m1_np, dpt.asnumpy(m2))
+    assert np.allclose(ref, dpt.asnumpy(res))
+
+    res = dpt.matmul(m1[:, :10, :10], m1[:, :10, :10].mT, out=m1[:, :10, :10])
+    ref = np.matmul(
+        m1_np[:, :10, :10], np.transpose(m1_np[:, :10, :10], (0, 2, 1))
+    )
+    assert np.allclose(ref, dpt.asnumpy(res))
+
+
+def test_matmul_readonly_out():
+    get_queue_or_skip()
+    m = dpt.ones((10, 10), dtype=dpt.int32)
+    r = dpt.empty_like(m)
+    r.flags["W"] = False
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m, m, out=r)
+
+
+def test_matmul_dtype():
+    get_queue_or_skip()
+
+    for dt1, dt2 in [
+        (dpt.int32, dpt.int16),
+        (dpt.int16, dpt.int32),
+        (dpt.float32, dpt.int16),
+        (dpt.int32, dpt.float32),
+    ]:
+        m1 = dpt.ones((10, 10), dtype=dt1)
+        m2 = dpt.ones((10, 10), dtype=dt2)
+
+        for ord in ["C", "A", "F", "K"]:
+            r = dpt.matmul(m1, m2, dtype=dpt.float32, order=ord)
+            assert r.dtype == dpt.float32
+
+
+@pytest.mark.parametrize("dt1", _numeric_types)
+@pytest.mark.parametrize("dt2", _numeric_types)
+@pytest.mark.parametrize("order", ["C", "K"])
+def test_matmul_type_promotion(dt1, dt2, order):
+    get_queue_or_skip()
+
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt1, q)
+    skip_if_dtype_not_supported(dt2, q)
+
+    b, n, k, m = 8, 10, 17, 10
+    m1 = dpt.ones((1, n, k), dtype=dt1)
+    m2 = dpt.ones((b, k, m), dtype=dt2)
+    expected_dt = dpt.result_type(m1, m2)
+
+    r = dpt.matmul(m1, m2, order=order)
+    assert r.shape == (b, n, m)
+    assert r.dtype == expected_dt
+
+    m1 = dpt.ones((b, n, k), dtype=dt1)
+    m2 = dpt.ones((1, k, m), dtype=dt2)
+
+    r = dpt.matmul(m1, m2, order=order)
+    assert r.shape == (b, n, m)
+    assert r.dtype == expected_dt
+
+    m1 = dpt.ones((n, k), dtype=dt1)
+    m2 = dpt.ones((k, m), dtype=dt2)
+
+    r = dpt.matmul(m1, m2, order=order)
+    assert r.shape == (n, m)
+    assert r.dtype == expected_dt
+
+
+def test_matmul_invalid_dtype():
+    get_queue_or_skip()
+
+    m1 = dpt.zeros((10, 10), dtype="f4")
+    m2 = dpt.zeros((10, 10), dtype="f4")
+    m3 = dpt.zeros((10, 10), dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m2, dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m3, dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m3, m1, dtype="i4")
+
+
+def test_matmul_out_errors():
+    q1 = get_queue_or_skip()
+    q2 = dpctl.SyclQueue()
+
+    sh = (10, 10)
+    dt = "i4"
+    m1 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
+    m2 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
+
+    with pytest.raises(TypeError):
+        dpt.matmul(m1, m2, out=dict())
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m2, out=dpt.empty((10,), dtype=dt, sycl_queue=q1))
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m2, out=dpt.empty(sh, dtype="f4", sycl_queue=q1))
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.matmul(m1, m2, out=dpt.empty(sh, dtype=dt, sycl_queue=q2))
+
+
+def test_matmul_order():
+    get_queue_or_skip()
+
+    sh = (
+        10,
+        10,
+    )
+    sh2 = tuple(2 * dim for dim in sh)
+    n = sh[-1]
+
+    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
+        ar1 = dpt.ones(sh, dtype=dt1, order="C")
+        ar2 = dpt.ones(sh, dtype=dt2, order="C")
+        r1 = dpt.matmul(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.matmul(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.matmul(ar1, ar2, order="A")
+        assert r3.flags.c_contiguous
+        r4 = dpt.matmul(ar1, ar2, order="K")
+        assert r4.flags.c_contiguous
+
+        ar1 = dpt.ones(sh, dtype=dt1, order="F")
+        ar2 = dpt.ones(sh, dtype=dt2, order="F")
+        r1 = dpt.matmul(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.matmul(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.matmul(ar1, ar2, order="A")
+        assert r3.flags.f_contiguous
+        r4 = dpt.matmul(ar1, ar2, order="K")
+        assert r4.flags.f_contiguous
+
+        ar1 = dpt.ones(sh2, dtype=dt1, order="C")[:10, ::-2]
+        ar2 = dpt.ones(sh2, dtype=dt2, order="C")[:10, ::-2]
+        r4 = dpt.matmul(ar1, ar2, order="K")
+        assert r4.strides == (n, -1)
+        r5 = dpt.matmul(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+        ar1 = dpt.ones(sh2, dtype=dt1, order="C")[:10, ::-2].mT
+        ar2 = dpt.ones(sh2, dtype=dt2, order="C")[:10, ::-2].mT
+        r4 = dpt.matmul(ar1, ar2, order="K")
+        assert r4.strides == (-1, n)
+        r5 = dpt.matmul(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+
+def test_matmul_invalid_order():
+    get_queue_or_skip()
+
+    sh = (
+        10,
+        10,
+    )
+    dt = "i4"
+
+    ar1 = dpt.ones(sh, dtype=dt, order="C")
+    ar2 = dpt.ones(sh, dtype=dt, order="C")
+    r = dpt.matmul(ar1, ar2, order="invalid")
+    assert r.flags.c_contiguous
+
+    ar1 = dpt.ones(sh, dtype=dt, order="F")
+    ar2 = dpt.ones(sh, dtype=dt, order="F")
+    r = dpt.matmul(ar1, ar2, order="invalid")
+    assert r.flags.f_contiguous
+
+
+def test_matmul_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = dpctl.SyclQueue()
+
+    sh = (
+        10,
+        10,
+    )
+    dt = "i4"
+    m1 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
+    m2 = dpt.zeros(sh, dtype=dt, sycl_queue=q2)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.matmul(m1, m2)
+
+
+def test_matmul_inplace_broadcasting():
+    get_queue_or_skip()
+
+    sh = (3, 5, 5)
+    dt = "i4"
+
+    m1 = dpt.ones((3, 5, 5), dtype=dt)
+    m2 = dpt.ones((1, 5, 5), dtype=dt)
+    m1 @= m2
+    assert dpt.all(m1 == dpt.full(sh, 5, dtype=dt))
+
+
+def test_matmul_prepend_dims():
+    get_queue_or_skip()
+
+    n = 5
+    for dt1, dt2 in [
+        (dpt.int32, dpt.int32),
+        (dpt.int32, dpt.int64),
+        (dpt.int64, dpt.int32),
+        (dpt.int32, dpt.uint32),
+    ]:
+        m = dpt.ones((n, 4), dtype=dt1)
+        v = dpt.ones((4,), dtype=dt2)
+        r = dpt.matmul(m, v)
+        assert r.shape == (n,)
+
+        r = dpt.matmul(v, m.mT)
+        assert r.shape == (n,)
+
+
+def test_matmul_inplace_same_tensors():
+    get_queue_or_skip()
+
+    n = 5
+    sh = (
+        n,
+        n,
+    )
+
+    ar1 = dpt.ones(sh, dtype="i4")
+    ar1 @= ar1
+    assert dpt.all(ar1 == dpt.full(sh, n, dtype="i4"))
+
+    ar1 = dpt.ones(sh, dtype="i8")
+    ar2 = dpt.ones(sh, dtype="i4")
+    dpt.matmul(ar1, ar2, out=ar1)
+    assert dpt.all(ar1 == dpt.full(sh, n, dtype=ar1.dtype))
+
+    ar1 = dpt.ones(sh, dtype="i4")
+    ar2 = dpt.ones(sh, dtype="i8")
+    dpt.matmul(ar1, ar2, out=ar2)
+    assert dpt.all(ar2 == dpt.full(sh, n, dtype=ar2.dtype))
+
+
+@pytest.fixture
+def random_matrix():
+    rs = np.random.RandomState(seed=123456)
+    m_np = rs.randint(low=0, high=6, size=(400, 400))
+    return m_np
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_matmul_largish_square(dtype, random_matrix):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m_np = random_matrix.astype(dtype)
+    x_np = np.matmul(m_np.T, m_np)
+
+    m = dpt.asarray(m_np)
+    mT = dpt.asarray(m.mT, copy=True, order="C")
+    x1 = dpt.matmul(m.mT, m)
+    x2 = dpt.matmul(mT, m)
+
+    tol = 0
+    if dpt.isdtype(x2.dtype, ("real floating", "complex floating")):
+        tol = 32 * dpt.finfo(x2.dtype).eps
+
+    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
+    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
+
+    # check stided input
+    m_np = m_np[:-1, :-1]
+    x_np = np.matmul(m_np.T, m_np)
+
+    m = m[:-1, :-1]
+    mT = dpt.asarray(m.mT, copy=True, order="C")
+    x1 = dpt.matmul(m.mT, m)
+    x2 = dpt.matmul(mT, m)
+
+    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
+    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_matmul_largish_rect(dtype, random_matrix):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m_np = random_matrix.astype(dtype)[:, :-1]
+    x_np = np.matmul(m_np.T[:-2, :], m_np)
+
+    m = dpt.asarray(m_np)
+    mmT = m.mT[:-2, :]
+    mT = dpt.asarray(mmT, copy=True, order="C")
+    x1 = dpt.matmul(mmT, m)
+    x2 = dpt.matmul(mT, m)
+
+    tol = 0
+    if dpt.isdtype(x2.dtype, ("real floating", "complex floating")):
+        tol = 32 * dpt.finfo(x2.dtype).eps
+
+    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
+    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
+
+    m_np = m_np[:-1, :-1]
+    x_np = np.matmul(m_np.T[:-2, :], m_np)
+
+    m = m[:-1, :-1]
+    mmT = m.mT[:-2, :]
+    mT = dpt.asarray(mmT, copy=True, order="C")
+    x1 = dpt.matmul(mmT, m)
+    x2 = dpt.matmul(mT, m)
+
+    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
+    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_tensordot_outer(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    t1 = dpt.ones((3, 8), dtype=dtype)
+    t2 = dpt.ones((4, 12), dtype=dtype)
+
+    r = dpt.tensordot(t1, t2, axes=0)
+    assert r.shape == t1.shape + t2.shape
+    assert dpt.allclose(r, dpt.ones_like(r))
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_tensordot_inner(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    t1 = dpt.ones((3, 8), dtype=dtype)
+    t2 = dpt.ones((4, 8), dtype=dtype)
+
+    r = dpt.tensordot(t1, t2.mT, axes=1)
+    assert r.shape == t1.shape[:1] + t2.shape[:1]
+    assert dpt.allclose(r, dpt.full_like(r, fill_value=t1.shape[1]))
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_tensordot_double(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    t1 = dpt.ones((2, 4, 8), dtype=dtype)
+    t2 = dpt.ones((3, 4, 8), dtype=dtype)
+
+    r = dpt.tensordot(t1, dpt.permute_dims(t2, (1, 2, 0)), axes=2)
+    assert r.shape == t1.shape[:1] + t2.shape[:1]
+    expected = dpt.prod(dpt.asarray(t1.shape[1:]))
+    assert dpt.allclose(r, dpt.full_like(r, fill_value=expected))
+
+
+@pytest.mark.parametrize("dtype", ["i4", "f4"])
+def test_tensordot_axes_sequence(dtype):
+    get_queue_or_skip()
+
+    r = 4
+    t1 = dpt.ones((2, 2, 4, 3), dtype=dtype)
+    t2 = dpt.ones((3, 2, 4, 3), dtype=dtype)
+
+    assert len(t1.shape) == r
+    assert len(t2.shape) == r
+
+    expected = dpt.prod(dpt.asarray(t1.shape[1:]))
+    ps1 = itertools.permutations(range(r))
+    ps2 = itertools.permutations(range(r))
+
+    for p1 in ps1:
+        assert len(p1) == r
+        inv_p1 = sorted(range(r), key=p1.__getitem__)
+        u1 = dpt.permute_dims(t1, p1)
+        x1_axes = inv_p1[1:]
+        for p2 in ps2:
+            inv_p2 = sorted(range(r), key=p2.__getitem__)
+            u2 = dpt.permute_dims(t2, p2)
+            x2_axes = inv_p2[1:]
+
+            tdr = dpt.tensordot(u1, u2, axes=(x1_axes, x2_axes))
+            assert tdr.shape == t1.shape[:1] + t2.shape[:1]
+            assert dpt.allclose(tdr, dpt.full_like(tdr, fill_value=expected))
+
+
+def test_tensordot_validation():
+    get_queue_or_skip()
+
+    with pytest.raises(TypeError):
+        dpt.tensordot(dict(), dict())
+
+    t1 = dpt.empty((10, 10, 10))
+    with pytest.raises(TypeError):
+        dpt.tensordot(t1, dict())
+
+    t2 = dpt.empty((10, 10, 10))
+    q = dpctl.SyclQueue(t2.sycl_context, t2.sycl_device, property="in_order")
+    with pytest.raises(dpctl.utils.ExecutionPlacementError):
+        dpt.tensordot(t1, t2.to_device(q))
+
+    invalid_axes = (
+        1,
+        2,
+        3,
+    )
+    with pytest.raises(ValueError):
+        dpt.tensordot(t1, t2, axes=invalid_axes)
+
+    invalid_axes = 5.2
+    with pytest.raises(TypeError):
+        dpt.tensordot(t1, t2, axes=invalid_axes)
+
+    invalid_axes = (
+        (1,),
+        (
+            0,
+            2,
+        ),
+    )
+    with pytest.raises(ValueError):
+        dpt.tensordot(t1, t2, axes=invalid_axes)
+
+    with pytest.raises(ValueError):
+        dpt.tensordot(t1[..., :5], t2)
+
+
+def test_tensordot_promotion():
+    get_queue_or_skip()
+
+    t1 = dpt.zeros((10, 10), dtype="i4")
+    t2 = dpt.zeros((10, 10), dtype="i8")
+
+    r1 = dpt.tensordot(t1, t2)
+    assert r1.dtype == t2.dtype
+
+    r2 = dpt.tensordot(t2, t1)
+    assert r2.dtype == t2.dtype
+
+    t3 = dpt.zeros((10, 10), dtype="u4")
+    r3 = dpt.tensordot(t1, t3)
+    assert r3.dtype == dpt.result_type(t1, t3)
+
+
+def test_tensordot_axes_errors():
+    get_queue_or_skip()
+
+    m1 = dpt.zeros((10, 10), dtype="i4")
+    m2 = dpt.zeros((10, 10), dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.tensordot(m1, m2, axes=-1)
+
+
+# tests for gh-1570
+def test_tensordot_gemm_small_k_m():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray(1, dtype="i2")
+    x2 = dpt.asarray([0, 1, 0, 0], dtype="i2")
+
+    res = dpt.tensordot(x1, x2, axes=0)
+    assert dpt.all(x2 == res)
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_vecdot_1d(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n = 511
+    v1 = dpt.ones(n, dtype=dtype)
+
+    v2 = dpt.ones(n, dtype=dtype)
+
+    r = dpt.vecdot(v1, v2)
+    expected_value = _map_int_to_type(n, r.dtype)
+    assert r == expected_value
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_vecdot_3d(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m1, m2, n = 7, 3, 511
+    v1 = dpt.ones((m1, m2, n), dtype=dtype)
+
+    v2 = dpt.ones((m1, m2, n), dtype=dtype)
+
+    r = dpt.vecdot(v1, v2)
+
+    assert r.shape == (
+        m1,
+        m2,
+    )
+    expected_value = _map_int_to_type(n, r.dtype)
+    assert dpt.all(r == expected_value)
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_vecdot_axis(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m1, m2, n = 7, 3, 511
+    v1 = dpt.ones((m1, n, m2), dtype=dtype)
+
+    v2 = dpt.ones((m1, n, m2), dtype=dtype)
+
+    r = dpt.vecdot(v1, v2, axis=-2)
+
+    assert r.shape == (
+        m1,
+        m2,
+    )
+    expected_value = _map_int_to_type(n, r.dtype)
+    assert dpt.all(r == expected_value)
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_vecdot_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m1, m2, n = 7, 3, 511
+    list1 = [1, 0, 2, 0]
+    pattern1 = dpt.asarray(list1, dtype=dtype)
+    n_padded1 = pattern1.size * (1 + ((n - 1) // pattern1.size))
+    v1 = dpt.tile(dpt.reshape(pattern1, (1, -1, 1)), (m1, n_padded1, m2))[
+        ::-1, :n, :
+    ]
+
+    list2 = [1, 2, 1, 2]
+    pattern2 = dpt.asarray(list2, dtype=dtype)
+    n_padded2 = pattern2.size * (1 + ((n - 1) // pattern2.size))
+    v2 = dpt.tile(dpt.reshape(pattern2, (1, -1, 1)), (m1, n_padded2, m2))[
+        :, :n, ::-1
+    ]
+
+    r = dpt.vecdot(v1, v2, axis=-2)
+
+    ref = sum(
+        el1 * el2
+        for el1, el2 in zip((list1 * n_padded1)[:n], (list2 * n_padded1)[:n])
+    )
+
+    assert r.shape == (
+        m1,
+        m2,
+    )
+    ref = _map_int_to_type(ref, r.dtype)
+    assert dpt.all(r == ref)
+
+
+def test_vector_arg_validation():
+    get_queue_or_skip()
+
+    s1, s2 = dpt.ones(tuple()), dpt.zeros(tuple())
+    v1, v2 = dpt.ones(16), dpt.zeros(16)
+
+    with pytest.raises(ValueError):
+        dpt.vecdot(s1, v2)
+
+    with pytest.raises(ValueError):
+        dpt.vecdot(v1, s2)
+
+    with pytest.raises(TypeError):
+        dpt.vecdot(dict(), v2)
+
+    with pytest.raises(TypeError):
+        dpt.vecdot(v2, None)
+
+    with pytest.raises(ValueError):
+        dpt.vecdot(v1[:5], v2[:4])
+
+    with pytest.raises(ValueError):
+        dpt.vecdot(v1, v2, axis=2)
+
+    with pytest.raises(ValueError):
+        dpt.vecdot(v1, v2, axis=-2)
+
+    q = dpctl.SyclQueue(
+        v2.sycl_context, v2.sycl_device, property="enable_profiling"
+    )
+    with pytest.raises(dpctl.utils.ExecutionPlacementError):
+        dpt.vecdot(v1, v2.to_device(q))
+
+    m1 = dpt.empty((10, 5))
+    m2 = dpt.empty((5, 5))
+    with pytest.raises(ValueError):
+        dpt.vecdot(m1, m2, axis=-1)
+
+
+def test_vecdot_broadcast():
+    get_queue_or_skip()
+
+    for dt1, dt2 in [
+        (dpt.int32, dpt.int32),
+        (dpt.int32, dpt.int64),
+        (dpt.int64, dpt.int32),
+        (dpt.int32, dpt.uint32),
+    ]:
+        m1 = dpt.zeros((1, 5), dtype=dt1)
+        m2 = dpt.zeros((5, 5), dtype=dt2)
+        r1 = dpt.vecdot(m1, m2, axis=-1)
+        r2 = dpt.vecdot(m2, m1, axis=-1)
+        assert r1.shape == r2.shape
+
+
+@pytest.mark.parametrize("dt1", _numeric_types)
+@pytest.mark.parametrize("dt2", _numeric_types)
+def test_vecdot_type_promotion(dt1, dt2):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt1, q)
+    skip_if_dtype_not_supported(dt2, q)
+
+    v1 = dpt.ones(128, dtype=dt1)
+    v2 = dpt.ones(128, dtype=dt2)
+
+    r = dpt.vecdot(v1, v2)
+    mul = v1 * v2
+    assert r.shape == ()
+    assert r.dtype == mul.dtype
+    assert dpt.allclose(r, dpt.sum(mul, dtype=mul.dtype))
+
+
+def test_vecdot_broadcast_o1_buffer():
+    get_queue_or_skip()
+
+    v1 = dpt.arange(10, dtype="i2")
+    v2 = dpt.ones((5, 10), dtype="i4")
+
+    res1 = dpt.vecdot(v1, v2)
+    assert res1.shape == (5,)
+
+    res2 = dpt.vecdot(v2, v1)
+    assert res2.shape == (5,)
+
+
+def test_vecdot_contig_small():
+    get_queue_or_skip()
+
+    n = 1
+    for dt in [dpt.int16, dpt.int32, dpt.complex64]:
+        v1 = dpt.zeros((10, n), dtype=dt)
+        v2 = dpt.ones_like(v1, dtype=dt)
+        v1[-1] = 1
+        res = dpt.vecdot(v1, v2)
+        assert dpt.all(res[:-1] == 0)
+        assert res[-1] == n
+
+
+def test_matmul_out_appended_axes():
+    get_queue_or_skip()
+
+    n0, n1, n2 = 4, 10, 5
+    # vm
+    x1 = dpt.ones(n1, dtype="i4")
+    x2 = dpt.ones((n0, n1, n2), dtype="i4")
+    out = dpt.empty((n0, n2), dtype="i4")
+
+    dpt.matmul(x1, x2, out=out)
+    assert dpt.all(out == n1)
+
+    # mv
+    x2 = x2.mT
+    x1, x2 = x2, x1
+    dpt.matmul(x1, x2, out=out)
+    assert dpt.all(out == n1)
+
+    # vv
+    x1 = dpt.ones(n1, dtype="i4")
+    out = dpt.empty((), dtype="i4")
+    dpt.matmul(x1, x2, out=out)
+    assert out == n1
diff --git a/dpnp/tests/tensor/test_usm_ndarray_manipulation.py b/dpnp/tests/tensor/test_usm_ndarray_manipulation.py
new file mode 100644
index 000000000000..45a53aa0532d
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_manipulation.py
@@ -0,0 +1,1609 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import dpctl
+import numpy as np
+import pytest
+from dpctl.utils import ExecutionPlacementError
+from numpy.testing import assert_, assert_array_equal, assert_raises_regex
+
+import dpnp.tensor as dpt
+from dpnp.tensor._numpy_helper import AxisError
+
+from .helper import get_queue_or_skip
+
+
+def test_permute_dims_incorrect_type():
+    X_list = list([[1, 2, 3], [4, 5, 6]])
+    X_tuple = tuple(X_list)
+    Xnp = np.array(X_list)
+
+    pytest.raises(TypeError, dpt.permute_dims, X_list, (1, 0))
+    pytest.raises(TypeError, dpt.permute_dims, X_tuple, (1, 0))
+    pytest.raises(TypeError, dpt.permute_dims, Xnp, (1, 0))
+
+
+def test_permute_dims_empty_array():
+    q = get_queue_or_skip()
+
+    Xnp = np.empty((10, 0))
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    Y = dpt.permute_dims(X, (1, 0))
+    Ynp = np.transpose(Xnp, (1, 0))
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_permute_dims_0d_1d():
+    q = get_queue_or_skip()
+
+    Xnp_0d = np.array(1, dtype="int64")
+    X_0d = dpt.asarray(Xnp_0d, sycl_queue=q)
+    Y_0d = dpt.permute_dims(X_0d, ())
+    assert_array_equal(dpt.asnumpy(Y_0d), dpt.asnumpy(X_0d))
+
+    Xnp_1d = np.random.randint(0, 2, size=6, dtype="int64")
+    X_1d = dpt.asarray(Xnp_1d, sycl_queue=q)
+    Y_1d = dpt.permute_dims(X_1d, (0))
+    assert_array_equal(dpt.asnumpy(Y_1d), dpt.asnumpy(X_1d))
+
+    pytest.raises(ValueError, dpt.permute_dims, X_1d, ())
+    pytest.raises(AxisError, dpt.permute_dims, X_1d, (1))
+    pytest.raises(ValueError, dpt.permute_dims, X_1d, (1, 0))
+    pytest.raises(
+        ValueError, dpt.permute_dims, dpt.reshape(X_1d, (2, 3)), (1, 1)
+    )
+
+
+@pytest.mark.parametrize("shapes", [(2, 2), (1, 4), (3, 3, 3), (4, 1, 3)])
+def test_permute_dims_2d_3d(shapes):
+    q = get_queue_or_skip()
+
+    Xnp_size = np.prod(shapes)
+
+    Xnp = np.random.randint(0, 2, size=Xnp_size, dtype="int64").reshape(shapes)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    X_ndim = X.ndim
+    if X_ndim == 2:
+        Y = dpt.permute_dims(X, (1, 0))
+        Ynp = np.transpose(Xnp, (1, 0))
+    elif X_ndim == 3:
+        X = dpt.asarray(Xnp, sycl_queue=q)
+        Y = dpt.permute_dims(X, (2, 0, 1))
+        Ynp = np.transpose(Xnp, (2, 0, 1))
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_expand_dims_incorrect_type():
+    X_list = [1, 2, 3, 4, 5]
+    with pytest.raises(TypeError):
+        dpt.permute_dims(X_list, axis=1)
+
+
+def test_expand_dims_0d():
+    q = get_queue_or_skip()
+
+    Xnp = np.array(1, dtype="int64")
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    Y = dpt.expand_dims(X, axis=0)
+    Ynp = np.expand_dims(Xnp, axis=0)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+    Y = dpt.expand_dims(X, axis=-1)
+    Ynp = np.expand_dims(Xnp, axis=-1)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+    pytest.raises(AxisError, dpt.expand_dims, X, axis=1)
+    pytest.raises(AxisError, dpt.expand_dims, X, axis=-2)
+
+
+@pytest.mark.parametrize("shapes", [(3,), (3, 3), (3, 3, 3)])
+def test_expand_dims_1d_3d(shapes):
+    q = get_queue_or_skip()
+
+    Xnp_size = np.prod(shapes)
+
+    Xnp = np.random.randint(0, 2, size=Xnp_size, dtype="int64").reshape(shapes)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    shape_len = len(shapes)
+    for axis in range(-shape_len - 1, shape_len):
+        Y = dpt.expand_dims(X, axis=axis)
+        Ynp = np.expand_dims(Xnp, axis=axis)
+        assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+    pytest.raises(AxisError, dpt.expand_dims, X, axis=shape_len + 1)
+    pytest.raises(AxisError, dpt.expand_dims, X, axis=-shape_len - 2)
+
+
+@pytest.mark.parametrize(
+    "axes", [(0, 1, 2), (0, -1, -2), (0, 3, 5), (0, -3, -5)]
+)
+def test_expand_dims_tuple(axes):
+    q = get_queue_or_skip()
+
+    Xnp = np.empty((3, 3, 3), dtype="u1")
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    Y = dpt.expand_dims(X, axis=axes)
+    Ynp = np.expand_dims(Xnp, axis=axes)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_expand_dims_incorrect_tuple():
+    try:
+        X = dpt.empty((3, 3, 3), dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(AxisError):
+        dpt.expand_dims(X, axis=(0, -6))
+    with pytest.raises(AxisError):
+        dpt.expand_dims(X, axis=(0, 5))
+
+    with pytest.raises(ValueError):
+        dpt.expand_dims(X, axis=(1, 1))
+
+
+def test_squeeze_incorrect_type():
+    X_list = [1, 2, 3, 4, 5]
+    with pytest.raises(TypeError):
+        dpt.permute_dims(X_list, 1)
+
+
+def test_squeeze_0d():
+    q = get_queue_or_skip()
+
+    Xnp = np.array(1)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    Y = dpt.squeeze(X)
+    Ynp = Xnp.squeeze()
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+    Y = dpt.squeeze(X, 0)
+    Ynp = Xnp.squeeze(0)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+    Y = dpt.squeeze(X, (0))
+    Ynp = Xnp.squeeze(0)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+    Y = dpt.squeeze(X, -1)
+    Ynp = Xnp.squeeze(-1)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+    pytest.raises(AxisError, dpt.squeeze, X, 1)
+    pytest.raises(AxisError, dpt.squeeze, X, -2)
+    pytest.raises(AxisError, dpt.squeeze, X, (1))
+    pytest.raises(AxisError, dpt.squeeze, X, (-2))
+    pytest.raises(ValueError, dpt.squeeze, X, (0, 0))
+
+
+@pytest.mark.parametrize(
+    "shapes",
+    [
+        (0),
+        (1),
+        (1, 2),
+        (2, 1),
+        (1, 1),
+        (2, 2),
+        (1, 0),
+        (0, 1),
+        (1, 2, 1),
+        (2, 1, 2),
+        (2, 2, 2),
+        (1, 1, 1),
+        (1, 0, 1),
+        (0, 1, 0),
+    ],
+)
+def test_squeeze_without_axes(shapes):
+    q = get_queue_or_skip()
+
+    Xnp = np.empty(shapes, dtype="u1")
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    Y = dpt.squeeze(X)
+    Ynp = Xnp.squeeze()
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize("axes", [0, 2, (0), (2), (0, 2)])
+def test_squeeze_axes_arg(axes):
+    q = get_queue_or_skip()
+
+    Xnp = np.array([[[1], [2], [3]]], dtype="u1")
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    Y = dpt.squeeze(X, axes)
+    Ynp = Xnp.squeeze(axes)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize("axes", [1, -2, (1), (-2), (0, 0), (1, 1)])
+def test_squeeze_axes_arg_error(axes):
+    q = get_queue_or_skip()
+
+    Xnp = np.array([[[1], [2], [3]]], dtype="u1")
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    pytest.raises(ValueError, dpt.squeeze, X, axes)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [np.array(0, dtype="u1"), (0,)],
+        [np.array(0, dtype="u1"), (1,)],
+        [np.array(0, dtype="u1"), (3,)],
+        [np.ones(1, dtype="u1"), (1,)],
+        [np.ones(1, dtype="u1"), (2,)],
+        [np.ones(1, dtype="u1"), (1, 2, 3)],
+        [np.arange(3, dtype="u1"), (3,)],
+        [np.arange(3, dtype="u1"), (1, 3)],
+        [np.arange(3, dtype="u1"), (2, 3)],
+        [np.ones(0, dtype="u1"), 0],
+        [np.ones(1, dtype="u1"), 1],
+        [np.ones(1, dtype="u1"), 2],
+        [np.ones(1, dtype="u1"), (0,)],
+        [np.ones((1, 2), dtype="u1"), (0, 2)],
+        [np.ones((2, 1), dtype="u1"), (2, 0)],
+    ],
+)
+def test_broadcast_to_succeeds(data):
+    q = get_queue_or_skip()
+
+    Xnp, target_shape = data
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    Y = dpt.broadcast_to(X, target_shape)
+    Ynp = np.broadcast_to(Xnp, target_shape)
+    assert_array_equal(dpt.asnumpy(Y), Ynp)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [(0,), ()],
+        [(1,), ()],
+        [(3,), ()],
+        [(3,), (1,)],
+        [(3,), (2,)],
+        [(3,), (4,)],
+        [(1, 2), (2, 1)],
+        [(1, 1), (1,)],
+        [(1,), -1],
+        [(1,), (-1,)],
+        [(1, 2), (-1, 2)],
+    ],
+)
+def test_broadcast_to_raises(data):
+    q = get_queue_or_skip()
+
+    orig_shape, target_shape = data
+    Xnp = np.zeros(orig_shape, dtype="i1")
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    pytest.raises(ValueError, dpt.broadcast_to, X, target_shape)
+
+
+def assert_broadcast_correct(input_shapes):
+    q = get_queue_or_skip()
+    np_arrays = [np.zeros(s, dtype="i1") for s in input_shapes]
+    out_np_arrays = np.broadcast_arrays(*np_arrays)
+    usm_arrays = [dpt.asarray(Xnp, sycl_queue=q) for Xnp in np_arrays]
+    out_usm_arrays = dpt.broadcast_arrays(*usm_arrays)
+    for Xnp, X in zip(out_np_arrays, out_usm_arrays):
+        assert_array_equal(
+            Xnp, dpt.asnumpy(X), err_msg=f"Failed for {input_shapes})"
+        )
+
+
+def assert_broadcast_arrays_raise(input_shapes):
+    q = get_queue_or_skip()
+    usm_arrays = [dpt.asarray(np.zeros(s), sycl_queue=q) for s in input_shapes]
+    pytest.raises(ValueError, dpt.broadcast_arrays, *usm_arrays)
+
+
+def test_broadcast_arrays_same():
+    q = get_queue_or_skip()
+    Xnp = np.arange(10)
+    Ynp = np.arange(10)
+    res_Xnp, res_Ynp = np.broadcast_arrays(Xnp, Ynp)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    Y = dpt.asarray(Ynp, sycl_queue=q)
+    res_X, res_Y = dpt.broadcast_arrays(X, Y)
+    assert_array_equal(res_Xnp, dpt.asnumpy(res_X))
+    assert_array_equal(res_Ynp, dpt.asnumpy(res_Y))
+
+
+def test_broadcast_arrays_one_off():
+    q = get_queue_or_skip()
+    Xnp = np.array([[1, 2, 3]])
+    Ynp = np.array([[1], [2], [3]])
+    res_Xnp, res_Ynp = np.broadcast_arrays(Xnp, Ynp)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    Y = dpt.asarray(Ynp, sycl_queue=q)
+    res_X, res_Y = dpt.broadcast_arrays(X, Y)
+    assert_array_equal(res_Xnp, dpt.asnumpy(res_X))
+    assert_array_equal(res_Ynp, dpt.asnumpy(res_Y))
+
+
+@pytest.mark.parametrize(
+    "shapes",
+    [
+        (),
+        (1,),
+        (3,),
+        (0, 1),
+        (0, 3),
+        (1, 0),
+        (3, 0),
+        (1, 3),
+        (3, 1),
+        (3, 3),
+    ],
+)
+def test_broadcast_arrays_same_shapes(shapes):
+    for shape in shapes:
+        single_input_shapes = [shape]
+        assert_broadcast_correct(single_input_shapes)
+        double_input_shapes = [shape, shape]
+        assert_broadcast_correct(double_input_shapes)
+        triple_input_shapes = [shape, shape, shape]
+        assert_broadcast_correct(triple_input_shapes)
+
+
+@pytest.mark.parametrize(
+    "shapes",
+    [
+        [[(1,), (3,)]],
+        [[(1, 3), (3, 3)]],
+        [[(3, 1), (3, 3)]],
+        [[(1, 3), (3, 1)]],
+        [[(1, 1), (3, 3)]],
+        [[(1, 1), (1, 3)]],
+        [[(1, 1), (3, 1)]],
+        [[(1, 0), (0, 0)]],
+        [[(0, 1), (0, 0)]],
+        [[(1, 0), (0, 1)]],
+        [[(1, 1), (0, 0)]],
+        [[(1, 1), (1, 0)]],
+        [[(1, 1), (0, 1)]],
+    ],
+)
+def test_broadcast_arrays_same_len_shapes(shapes):
+    # Check that two different input shapes of the same length, but some have
+    # ones, broadcast to the correct shape.
+
+    for input_shapes in shapes:
+        assert_broadcast_correct(input_shapes)
+        assert_broadcast_correct(input_shapes[::-1])
+
+
+@pytest.mark.parametrize(
+    "shapes",
+    [
+        [[(), (3,)]],
+        [[(3,), (3, 3)]],
+        [[(3,), (3, 1)]],
+        [[(1,), (3, 3)]],
+        [[(), (3, 3)]],
+        [[(1, 1), (3,)]],
+        [[(1,), (3, 1)]],
+        [[(1,), (1, 3)]],
+        [[(), (1, 3)]],
+        [[(), (3, 1)]],
+        [[(), (0,)]],
+        [[(0,), (0, 0)]],
+        [[(0,), (0, 1)]],
+        [[(1,), (0, 0)]],
+        [[(), (0, 0)]],
+        [[(1, 1), (0,)]],
+        [[(1,), (0, 1)]],
+        [[(1,), (1, 0)]],
+        [[(), (1, 0)]],
+        [[(), (0, 1)]],
+    ],
+)
+def test_broadcast_arrays_different_len_shapes(shapes):
+    # Check that two different input shapes (of different lengths) broadcast
+    # to the correct shape.
+
+    for input_shapes in shapes:
+        assert_broadcast_correct(input_shapes)
+        assert_broadcast_correct(input_shapes[::-1])
+
+
+@pytest.mark.parametrize(
+    "shapes",
+    [
+        [[(3,), (4,)]],
+        [[(2, 3), (2,)]],
+        [[(3,), (3,), (4,)]],
+        [[(1, 3, 4), (2, 3, 3)]],
+    ],
+)
+def test_incompatible_shapes_raise_valueerror(shapes):
+    for input_shapes in shapes:
+        assert_broadcast_arrays_raise(input_shapes)
+        assert_broadcast_arrays_raise(input_shapes[::-1])
+
+
+def test_broadcast_arrays_no_args():
+    with pytest.raises(ValueError):
+        dpt.broadcast_arrays()
+
+
+def test_flip_axis_incorrect():
+    q = get_queue_or_skip()
+
+    X_np = np.ones((4, 4))
+    X = dpt.asarray(X_np, sycl_queue=q)
+
+    pytest.raises(AxisError, dpt.flip, dpt.asarray(np.ones(4)), axis=1)
+    pytest.raises(AxisError, dpt.flip, X, axis=2)
+    pytest.raises(AxisError, dpt.flip, X, axis=-3)
+    pytest.raises(AxisError, dpt.flip, X, axis=(0, 3))
+
+
+def test_flip_0d():
+    q = get_queue_or_skip()
+
+    Xnp = np.array(1, dtype="int64")
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    Ynp = np.flip(Xnp)
+    Y = dpt.flip(X)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+    pytest.raises(AxisError, dpt.flip, X, axis=0)
+    pytest.raises(AxisError, dpt.flip, X, axis=1)
+    pytest.raises(AxisError, dpt.flip, X, axis=-1)
+
+
+def test_flip_1d():
+    q = get_queue_or_skip()
+
+    Xnp = np.arange(6)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    for ax in range(-X.ndim, X.ndim):
+        Ynp = np.flip(Xnp, axis=ax)
+        Y = dpt.flip(X, axis=ax)
+        assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+    Ynp = np.flip(Xnp, axis=0)
+    Y = dpt.flip(X, axis=0)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize(
+    "shapes",
+    [
+        (3, 2),
+        (2, 3),
+        (2, 2),
+        (3, 3),
+        (3, 2, 3),
+        (2, 3, 2),
+        (2, 2, 2),
+        (3, 3, 3),
+    ],
+)
+def test_flip_2d_3d(shapes):
+    q = get_queue_or_skip()
+
+    Xnp_size = np.prod(shapes)
+    Xnp = np.arange(Xnp_size).reshape(shapes)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    for ax in range(-X.ndim, X.ndim):
+        Y = dpt.flip(X, axis=ax)
+        Ynp = np.flip(Xnp, axis=ax)
+        assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize(
+    "shapes",
+    [
+        (1,),
+        (3,),
+        (2, 3),
+        (3, 2),
+        (2, 2),
+        (1, 2, 3),
+        (2, 1, 3),
+        (2, 3, 1),
+        (3, 2, 1),
+        (3, 3, 3),
+    ],
+)
+def test_flip_default_axes(shapes):
+    q = get_queue_or_skip()
+
+    Xnp_size = np.prod(shapes)
+    Xnp = np.arange(Xnp_size).reshape(shapes)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    Y = dpt.flip(X)
+    Ynp = np.flip(Xnp)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize(
+    "shapes",
+    [
+        (0),
+        (1),
+        (1, 1),
+        (1, 0),
+        (0, 1),
+        (1, 1, 1),
+        (1, 0, 1),
+        (0, 1, 0),
+    ],
+)
+def test_flip_empty_0_size_dim(shapes):
+    q = get_queue_or_skip()
+
+    X = dpt.empty(shapes, sycl_queue=q)
+    dpt.flip(X)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [(2, 3), (0, 1)],
+        [(2, 3), (1, 0)],
+        [(2, 3), ()],
+        [(2, 1, 3), (0, 2)],
+        [(3, 1, 2), (2, 0)],
+        [(3, 3, 3), (2,)],
+        [(1, 2, 3), [0, -2]],
+        [(3, 1, 2), [-1, 0]],
+        [(3, 3, 3), [-2, -1]],
+    ],
+)
+def test_flip_multiple_axes(data):
+    q = get_queue_or_skip()
+
+    shape, axes = data
+    Xnp_size = np.prod(shape)
+    Xnp = np.arange(Xnp_size).reshape(shape)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    Y = dpt.flip(X, axis=axes)
+    Ynp = np.flip(Xnp, axis=axes)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_roll_scalar():
+    q = get_queue_or_skip()
+
+    Xnp = np.ones([], dtype="f4")
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    Y = dpt.roll(X, 1)
+    Ynp = np.roll(Xnp, 1)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+    with pytest.raises(AxisError):
+        dpt.roll(X, 1, axis=0)
+    with pytest.raises(AxisError):
+        dpt.roll(X, 1, axis=1)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [2, None],
+        [-2, None],
+        [2, 0],
+        [-2, 0],
+        [2, ()],
+        [11, 0],
+    ],
+)
+def test_roll_1d(data):
+    q = get_queue_or_skip()
+
+    Xnp = np.arange(10)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    sh, ax = data
+
+    Y = dpt.roll(X, sh, axis=ax)
+    Ynp = np.roll(Xnp, sh, axis=ax)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+    Y = dpt.roll(X, sh, axis=ax)
+    Ynp = np.roll(Xnp, sh, axis=ax)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, None],
+        [1, 0],
+        [1, 1],
+        [1, ()],
+        # Roll multiple axes at once
+        [1, (0, 1)],
+        [(1, 0), (0, 1)],
+        [(-1, 0), (1, 0)],
+        [(0, 1), (0, 1)],
+        [(0, -1), (0, 1)],
+        [(1, 1), (0, 1)],
+        [(-1, -1), (0, 1)],
+        # Roll the same axis multiple times.
+        [1, (0, 0)],
+        [1, (1, 1)],
+        # Roll more than one turn in either direction.
+        [6, 1],
+        [-4, 1],
+    ],
+)
+def test_roll_2d(data):
+    q = get_queue_or_skip()
+
+    Xnp = np.arange(10).reshape(2, 5)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+    sh, ax = data
+
+    Y = dpt.roll(X, sh, axis=ax)
+    Ynp = np.roll(Xnp, sh, axis=ax)
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_roll_out_bounds_shifts():
+    "See gh-1857"
+    get_queue_or_skip()
+
+    x = dpt.arange(4)
+    y = dpt.roll(x, np.uint64(2**63 + 2))
+    expected = dpt.roll(x, 2)
+    assert dpt.all(y == expected)
+
+    x_empty = x[1:1]
+    y = dpt.roll(x_empty, 11)
+    assert y.size == 0
+
+    x_2d = dpt.reshape(x, (2, 2))
+    y = dpt.roll(x_2d, np.uint64(2**63 + 1), axis=1)
+    expected = dpt.roll(x_2d, 1, axis=1)
+    assert dpt.all(y == expected)
+
+    x_2d_empty = x_2d[:, 1:1]
+    y = dpt.roll(x_2d_empty, 3, axis=1)
+    expected = dpt.empty_like(x_2d_empty)
+    assert dpt.all(y == expected)
+
+
+def test_roll_validation():
+    get_queue_or_skip()
+
+    X = {}
+    with pytest.raises(TypeError):
+        dpt.roll(X)
+
+    X = dpt.empty((1, 2, 3))
+    shift = ((2, 3, 1), (1, 2, 3))
+    with pytest.raises(ValueError):
+        dpt.roll(X, shift=shift, axis=(0, 1, 2))
+
+
+def test_concat_incorrect_type():
+    Xnp = np.ones((2, 2))
+    with pytest.raises(TypeError):
+        dpt.concat()
+    with pytest.raises(TypeError):
+        dpt.concat([])
+    with pytest.raises(TypeError):
+        dpt.concat(Xnp)
+    with pytest.raises(TypeError):
+        dpt.concat([Xnp, Xnp])
+
+
+def test_concat_incorrect_queue():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    X = dpt.ones((2, 2), sycl_queue=q1)
+    Y = dpt.ones((2, 2), sycl_queue=q2)
+
+    pytest.raises(ValueError, dpt.concat, [X, Y])
+
+
+def test_concat_different_dtype():
+    q = get_queue_or_skip()
+
+    X = dpt.ones((2, 2), dtype=np.int64, sycl_queue=q)
+    Y = dpt.ones((3, 2), dtype=np.uint32, sycl_queue=q)
+
+    XY = dpt.concat([X, Y])
+
+    assert XY.dtype is X.dtype
+    assert XY.shape == (5, 2)
+    assert XY.sycl_queue == q
+
+    X1 = dpt.arange(10, dtype="i2", sycl_queue=q)
+    Y1 = dpt.arange(5, dtype="i4", sycl_queue=q)
+
+    XY1 = dpt.concat([X1[::2], Y1[::-1]], axis=None)
+    assert XY1.shape == (10,)
+    assert XY1.sycl_queue == q
+    assert XY1.dtype == Y1.dtype
+
+
+def test_concat_incorrect_ndim():
+    q = get_queue_or_skip()
+
+    X = dpt.ones((2, 2), sycl_queue=q)
+    Y = dpt.ones((2, 2, 2), sycl_queue=q)
+
+    pytest.raises(ValueError, dpt.concat, [X, Y])
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [(2, 2), (3, 3), 0],
+        [(2, 2), (3, 3), 1],
+        [(3, 2), (3, 3), 0],
+        [(2, 3), (3, 3), 1],
+    ],
+)
+def test_concat_incorrect_shape(data):
+    q = get_queue_or_skip()
+
+    Xshape, Yshape, axis = data
+
+    X = dpt.ones(Xshape, sycl_queue=q)
+    Y = dpt.ones(Yshape, sycl_queue=q)
+
+    pytest.raises(ValueError, dpt.concat, [X, Y], axis=axis)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [(6,), 0],
+        [(2, 3), 1],
+        [(3, 2), -1],
+        [(1, 6), 0],
+        [(2, 1, 3), 2],
+    ],
+)
+def test_concat_1array(data):
+    q = get_queue_or_skip()
+
+    Xshape, axis = data
+
+    Xnp = np.arange(6).reshape(Xshape)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    Ynp = np.concatenate([Xnp], axis=axis)
+    Y = dpt.concat([X], axis=axis)
+
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+    Ynp = np.concatenate((Xnp,), axis=axis)
+    Y = dpt.concat((X,), axis=axis)
+
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [(1,), (1,), 0],
+        [(0, 2), (0, 2), 1],
+        [(0, 2), (2, 2), 0],
+        [(2, 1), (2, 2), -1],
+        [(2, 2, 2), (2, 1, 2), 1],
+        [(3, 3, 3), (2, 2), None],
+    ],
+)
+def test_concat_2arrays(data):
+    q = get_queue_or_skip()
+
+    Xshape, Yshape, axis = data
+
+    Xnp = np.ones(Xshape)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    Ynp = np.zeros(Yshape)
+    Y = dpt.asarray(Ynp, sycl_queue=q)
+
+    Znp = np.concatenate([Xnp, Ynp], axis=axis)
+    Z = dpt.concat([X, Y], axis=axis)
+
+    assert_array_equal(Znp, dpt.asnumpy(Z))
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [(1,), (1,), (1,), 0],
+        [(0, 2), (2, 2), (1, 2), 0],
+        [(2, 1, 2), (2, 2, 2), (2, 4, 2), 1],
+    ],
+)
+def test_concat_3arrays(data):
+    q = get_queue_or_skip()
+
+    Xshape, Yshape, Zshape, axis = data
+
+    Xnp = np.ones(Xshape)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    Ynp = np.zeros(Yshape)
+    Y = dpt.asarray(Ynp, sycl_queue=q)
+
+    Znp = np.full(Zshape, 2.0)
+    Z = dpt.asarray(Znp, sycl_queue=q)
+
+    Rnp = np.concatenate([Xnp, Ynp, Znp], axis=axis)
+    R = dpt.concat([X, Y, Z], axis=axis)
+
+    assert_array_equal(Rnp, dpt.asnumpy(R))
+
+
+def test_concat_axis_none_strides():
+    q = get_queue_or_skip()
+    Xnp = np.arange(0, 18).reshape((6, 3))
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    Ynp = np.arange(20, 36).reshape((4, 2, 2))
+    Y = dpt.asarray(Ynp, sycl_queue=q)
+
+    Znp = np.concatenate([Xnp[::2], Ynp[::2]], axis=None)
+    Z = dpt.concat([X[::2], Y[::2]], axis=None)
+
+    assert_array_equal(Znp, dpt.asnumpy(Z))
+
+
+def test_stack_incorrect_shape():
+    q = get_queue_or_skip()
+
+    X = dpt.ones((1,), sycl_queue=q)
+    Y = dpt.ones((2,), sycl_queue=q)
+
+    with pytest.raises(ValueError):
+        dpt.stack([X, Y], axis=0)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [(6,), 0],
+        [(2, 3), 1],
+        [(3, 2), -1],
+        [(1, 6), 2],
+        [(2, 1, 3), 2],
+    ],
+)
+def test_stack_1array(data):
+    q = get_queue_or_skip()
+
+    shape, axis = data
+
+    Xnp = np.arange(6).reshape(shape)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    Ynp = np.stack([Xnp], axis=axis)
+    Y = dpt.stack([X], axis=axis)
+
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+    Ynp = np.stack((Xnp,), axis=axis)
+    Y = dpt.stack((X,), axis=axis)
+
+    assert_array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [(1,), 0],
+        [(0, 2), 0],
+        [(2, 0), 0],
+        [(2, 3), 0],
+        [(2, 3), 1],
+        [(2, 3), 2],
+        [(2, 3), -1],
+        [(2, 3), -2],
+        [(2, 2, 2), 1],
+    ],
+)
+def test_stack_2arrays(data):
+    q = get_queue_or_skip()
+
+    shape, axis = data
+
+    Xnp = np.ones(shape)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    Ynp = np.zeros(shape)
+    Y = dpt.asarray(Ynp, sycl_queue=q)
+
+    Znp = np.stack([Xnp, Ynp], axis=axis)
+    Z = dpt.stack([X, Y], axis=axis)
+
+    assert_array_equal(Znp, dpt.asnumpy(Z))
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [(1,), 0],
+        [(0, 2), 0],
+        [(2, 1, 2), 1],
+    ],
+)
+def test_stack_3arrays(data):
+    q = get_queue_or_skip()
+
+    shape, axis = data
+
+    Xnp = np.ones(shape)
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    Ynp = np.zeros(shape)
+    Y = dpt.asarray(Ynp, sycl_queue=q)
+
+    Znp = np.full(shape, 2.0)
+    Z = dpt.asarray(Znp, sycl_queue=q)
+
+    Rnp = np.stack([Xnp, Ynp, Znp], axis=axis)
+    R = dpt.stack([X, Y, Z], axis=axis)
+
+    assert_array_equal(Rnp, dpt.asnumpy(R))
+
+
+def test_can_cast():
+    q = get_queue_or_skip()
+
+    # incorrect input
+    X = dpt.ones((2, 2), dtype=dpt.int16, sycl_queue=q)
+    pytest.raises(TypeError, dpt.can_cast, X, 1)
+    pytest.raises(TypeError, dpt.can_cast, X, X)
+    X_np = np.ones((2, 2), dtype=np.int16)
+
+    assert dpt.can_cast(X, "float32") == np.can_cast(X_np, "float32")
+    assert dpt.can_cast(X, dpt.int32) == np.can_cast(X_np, np.int32)
+    assert dpt.can_cast(X, dpt.int64) == np.can_cast(X_np, np.int64)
+
+
+def test_result_type():
+    q = get_queue_or_skip()
+
+    usm_ar = dpt.ones((2), dtype=dpt.int16, sycl_queue=q)
+    np_ar = dpt.asnumpy(usm_ar)
+
+    X = [usm_ar, dpt.int32, "int64", usm_ar]
+    X_np = [np_ar, np.int32, "int64", np_ar]
+
+    assert dpt.result_type(*X) == np.result_type(*X_np)
+
+    X = [usm_ar, dpt.int32, "int64", True]
+    X_np = [np_ar, np.int32, "int64", True]
+
+    assert dpt.result_type(*X) == np.result_type(*X_np)
+
+    X = [usm_ar, dpt.int32, "int64", 2]
+    X_np = [np_ar, np.int32, "int64", 2]
+
+    assert dpt.result_type(*X) == np.result_type(*X_np)
+
+    X = [dpt.int32, "int64", 2]
+    X_np = [np.int32, "int64", 2]
+
+    assert dpt.result_type(*X) == np.result_type(*X_np)
+
+    X = [usm_ar, dpt.int32, "int64", 2.0]
+    X_np = [np_ar, np.int32, "int64", 2.0]
+
+    assert dpt.result_type(*X).kind == np.result_type(*X_np).kind
+
+    X = [usm_ar, dpt.int32, "int64", 2.0 + 1j]
+    X_np = [np_ar, np.int32, "int64", 2.0 + 1j]
+
+    assert dpt.result_type(*X).kind == np.result_type(*X_np).kind
+
+
+def test_swapaxes_1d():
+    get_queue_or_skip()
+    x = np.array([[1, 2, 3]])
+    exp = np.swapaxes(x, 0, 1)
+
+    y = dpt.asarray([[1, 2, 3]])
+    res = dpt.swapaxes(y, 0, 1)
+
+    assert_array_equal(exp, dpt.asnumpy(res))
+
+
+def test_swapaxes_2d():
+    get_queue_or_skip()
+    x = np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]])
+    exp = np.swapaxes(x, 0, 2)
+
+    y = dpt.asarray([[[0, 1], [2, 3]], [[4, 5], [6, 7]]])
+    res = dpt.swapaxes(y, 0, 2)
+
+    assert_array_equal(exp, dpt.asnumpy(res))
+
+
+@pytest.mark.parametrize(
+    "source, expected",
+    [
+        (0, (6, 7, 5)),
+        (1, (5, 7, 6)),
+        (2, (5, 6, 7)),
+        (-1, (5, 6, 7)),
+    ],
+)
+def test_moveaxis_move_to_end(source, expected):
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(5 * 6 * 7), (5, 6, 7))
+    actual = dpt.moveaxis(x, source, -1).shape
+    assert_(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "source, destination, expected",
+    [
+        (0, 1, (2, 1, 3, 4)),
+        (1, 2, (1, 3, 2, 4)),
+        (1, -1, (1, 3, 4, 2)),
+    ],
+)
+def test_moveaxis_new_position(source, destination, expected):
+    get_queue_or_skip()
+    x = dpt.reshape(dpt.arange(24), (1, 2, 3, 4))
+    actual = dpt.moveaxis(x, source, destination).shape
+    assert_(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "source, destination",
+    [
+        (0, 0),
+        (3, -1),
+        (-1, 3),
+        ([0, -1], [0, -1]),
+        ([2, 0], [2, 0]),
+    ],
+)
+def test_moveaxis_preserve_order(source, destination):
+    get_queue_or_skip()
+    x = dpt.zeros((1, 2, 3, 4))
+    actual = dpt.moveaxis(x, source, destination).shape
+    assert_(actual, (1, 2, 3, 4))
+
+
+@pytest.mark.parametrize(
+    "shape, source, destination, expected",
+    [
+        ((0, 1, 2, 3), [0, 1], [2, 3], (2, 3, 0, 1)),
+        ((0, 1, 2, 3), [2, 3], [0, 1], (2, 3, 0, 1)),
+        ((0, 1, 2, 3), [0, 1, 2], [2, 3, 0], (2, 3, 0, 1)),
+        ((0, 1, 2, 3), [3, 0], [1, 0], (0, 3, 1, 2)),
+        ((0, 1, 2, 3), [0, 3], [0, 1], (0, 3, 1, 2)),
+        ((1, 2, 3, 4), range(4), range(4), (1, 2, 3, 4)),
+    ],
+)
+def test_moveaxis_move_multiples(shape, source, destination, expected):
+    get_queue_or_skip()
+    x = dpt.zeros(shape)
+    y = dpt.moveaxis(x, source, destination)
+    actual = y.shape
+    assert_(actual, expected)
+    assert y._pointer == x._pointer
+
+
+def test_moveaxis_errors():
+    try:
+        x_flat = dpt.arange(6)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    x = dpt.reshape(x_flat, (1, 2, 3))
+    assert_raises_regex(
+        AxisError, "source.*out of bounds", dpt.moveaxis, x, 3, 0
+    )
+    assert_raises_regex(
+        AxisError, "source.*out of bounds", dpt.moveaxis, x, -4, 0
+    )
+    assert_raises_regex(
+        AxisError, "destination.*out of bounds", dpt.moveaxis, x, 0, 5
+    )
+    assert_raises_regex(
+        ValueError, "repeated axis in `source`", dpt.moveaxis, x, [0, 0], [0, 1]
+    )
+    assert_raises_regex(
+        ValueError,
+        "repeated axis in `destination`",
+        dpt.moveaxis,
+        x,
+        [0, 1],
+        [1, 1],
+    )
+    assert_raises_regex(
+        ValueError, "must have the same number", dpt.moveaxis, x, 0, [0, 1]
+    )
+    assert_raises_regex(
+        ValueError, "must have the same number", dpt.moveaxis, x, [0, 1], [0]
+    )
+
+
+def test_unstack_axis0():
+    try:
+        x_flat = dpt.arange(6)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    y = dpt.reshape(x_flat, (2, 3))
+    res = dpt.unstack(y)
+
+    assert_array_equal(dpt.asnumpy(y[0, ...]), dpt.asnumpy(res[0]))
+    assert_array_equal(dpt.asnumpy(y[1, ...]), dpt.asnumpy(res[1]))
+
+
+def test_unstack_axis1():
+    try:
+        x_flat = dpt.arange(6)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    y = dpt.reshape(x_flat, (2, 3))
+    res = dpt.unstack(y, axis=1)
+
+    assert_array_equal(dpt.asnumpy(y[:, 0, ...]), dpt.asnumpy(res[0]))
+    assert_array_equal(dpt.asnumpy(y[:, 1, ...]), dpt.asnumpy(res[1]))
+    assert_array_equal(dpt.asnumpy(y[:, 2, ...]), dpt.asnumpy(res[2]))
+
+
+def test_unstack_axis2():
+    try:
+        x_flat = dpt.arange(60)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    y = dpt.reshape(x_flat, (4, 5, 3))
+    res = dpt.unstack(y, axis=2)
+
+    assert_array_equal(dpt.asnumpy(y[:, :, 0, ...]), dpt.asnumpy(res[0]))
+    assert_array_equal(dpt.asnumpy(y[:, :, 1, ...]), dpt.asnumpy(res[1]))
+    assert_array_equal(dpt.asnumpy(y[:, :, 2, ...]), dpt.asnumpy(res[2]))
+
+
+def test_finfo_object():
+    fi = dpt.finfo(dpt.float32)
+    assert isinstance(fi.bits, int)
+    assert isinstance(fi.max, float)
+    assert isinstance(fi.min, float)
+    assert isinstance(fi.eps, float)
+    assert isinstance(fi.epsneg, float)
+    assert isinstance(fi.smallest_normal, float)
+    assert isinstance(fi.tiny, float)
+    assert isinstance(fi.precision, float)
+    assert isinstance(fi.resolution, float)
+    assert isinstance(fi.dtype, dpt.dtype)
+    assert isinstance(str(fi), str)
+    assert isinstance(repr(fi), str)
+
+
+def test_repeat_scalar_sequence_agreement():
+    get_queue_or_skip()
+
+    x = dpt.arange(5, dtype="i4")
+    expected_res = dpt.empty(10, dtype="i4")
+    expected_res[1::2], expected_res[::2] = x, x
+
+    # scalar case
+    reps = 2
+    res = dpt.repeat(x, reps)
+    assert dpt.all(res == expected_res)
+
+    # tuple
+    reps = (2, 2, 2, 2, 2)
+    res = dpt.repeat(x, reps)
+    assert dpt.all(res == expected_res)
+
+
+def test_repeat_as_broadcasting():
+    get_queue_or_skip()
+
+    reps = 5
+    x = dpt.arange(reps, dtype="i4")
+    x1 = x[:, dpt.newaxis]
+    expected_res = dpt.broadcast_to(x1, (reps, reps))
+
+    res = dpt.repeat(x1, reps, axis=1)
+    assert dpt.all(res == expected_res)
+
+    x2 = x[dpt.newaxis, :]
+    expected_res = dpt.broadcast_to(x2, (reps, reps))
+
+    res = dpt.repeat(x2, reps, axis=0)
+    assert dpt.all(res == expected_res)
+
+
+def test_repeat_axes():
+    get_queue_or_skip()
+
+    reps = 2
+    x = dpt.reshape(dpt.arange(5 * 10, dtype="i4"), (5, 10))
+    expected_res = dpt.empty((x.shape[0] * 2, x.shape[1]), dtype=x.dtype)
+    expected_res[::2, :], expected_res[1::2] = x, x
+    res = dpt.repeat(x, reps, axis=0)
+    assert dpt.all(res == expected_res)
+
+    expected_res = dpt.empty((x.shape[0], x.shape[1] * 2), dtype=x.dtype)
+    expected_res[:, ::2], expected_res[:, 1::2] = x, x
+    res = dpt.repeat(x, reps, axis=1)
+    assert dpt.all(res == expected_res)
+
+    x = dpt.arange(10, dtype="i4")
+    expected_res = dpt.empty(x.shape[0] * reps, dtype=x.dtype)
+    expected_res[::2], expected_res[1::2] = x, x
+    res = dpt.repeat(x, reps, axis=0)
+    assert dpt.all(res == expected_res)
+
+
+def test_repeat_size_0_outputs():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 0, 5), dtype="i4")
+    reps = 10
+    res = dpt.repeat(x, reps, axis=0)
+    assert res.size == 0
+    assert res.shape == (30, 0, 5)
+
+    res = dpt.repeat(x, reps, axis=1)
+    assert res.size == 0
+    assert res.shape == (3, 0, 5)
+
+    res = dpt.repeat(x, (2, 2, 2), axis=0)
+    assert res.size == 0
+    assert res.shape == (6, 0, 5)
+
+    x = dpt.ones((3, 2, 5))
+    res = dpt.repeat(x, 0, axis=1)
+    assert res.size == 0
+    assert res.shape == (3, 0, 5)
+
+    res = dpt.repeat(x, (0, 0), axis=1)
+    assert res.size == 0
+    assert res.shape == (3, 0, 5)
+
+    # axis=None cases
+    res = dpt.repeat(x, 0)
+    assert res.size == 0
+
+    res = dpt.repeat(x, (0,) * x.size)
+    assert res.size == 0
+
+
+def test_repeat_strides():
+    get_queue_or_skip()
+
+    reps = 2
+    x = dpt.reshape(dpt.arange(10 * 10, dtype="i4"), (10, 10))
+    x1 = x[:, ::-2]
+    expected_res = dpt.empty((10, 10), dtype="i4")
+    expected_res[:, ::2], expected_res[:, 1::2] = x1, x1
+    res = dpt.repeat(x1, reps, axis=1)
+    assert dpt.all(res == expected_res)
+    res = dpt.repeat(x1, (reps,) * x1.shape[1], axis=1)
+    assert dpt.all(res == expected_res)
+
+    x1 = x[::-2, :]
+    expected_res = dpt.empty((10, 10), dtype="i4")
+    expected_res[::2, :], expected_res[1::2, :] = x1, x1
+    res = dpt.repeat(x1, reps, axis=0)
+    assert dpt.all(res == expected_res)
+    res = dpt.repeat(x1, (reps,) * x1.shape[0], axis=0)
+    assert dpt.all(res == expected_res)
+
+    # axis=None
+    x = dpt.reshape(dpt.arange(10 * 10), (10, 10))
+    x1 = dpt.reshape(x[::-2, :], -1)
+    x2 = x[::-2, :]
+    expected_res = dpt.empty(10 * 10, dtype="i4")
+    expected_res[::2], expected_res[1::2] = x1, x1
+    res = dpt.repeat(x2, reps)
+    assert dpt.all(res == expected_res)
+    res = dpt.repeat(x2, (reps,) * x1.size)
+    assert dpt.all(res == expected_res)
+
+
+def test_repeat_casting():
+    get_queue_or_skip()
+
+    x = dpt.arange(5, dtype="i4")
+    # i4 is cast to i8
+    reps = dpt.ones(5, dtype="i4")
+    res = dpt.repeat(x, reps)
+    assert res.shape == x.shape
+    assert dpt.all(res == x)
+
+
+def test_repeat_strided_repeats():
+    get_queue_or_skip()
+
+    x = dpt.arange(5, dtype="i4")
+    reps = dpt.ones(10, dtype="i8")
+    reps[::2] = 0
+    reps = reps[::-2]
+    res = dpt.repeat(x, reps)
+    assert res.shape == x.shape
+    assert dpt.all(res == x)
+
+
+def test_repeat_size1_repeats():
+    get_queue_or_skip()
+
+    x = dpt.arange(5, dtype="i4")
+    expected_res = dpt.repeat(x, 2)
+    # 0D repeats
+    reps_0d = dpt.asarray(2, dtype="i8")
+    res = dpt.repeat(x, reps_0d)
+    assert dpt.all(res == expected_res)
+    # 1D repeats
+    reps_1d = dpt.asarray([2], dtype="i8")
+    res = dpt.repeat(x, reps_1d)
+    assert dpt.all(res == expected_res)
+
+
+def test_repeat_arg_validation():
+    get_queue_or_skip()
+
+    x = {}
+    with pytest.raises(TypeError):
+        dpt.repeat(x, 2)
+
+    # axis must be 0 for scalar
+    x = dpt.empty(())
+    with pytest.raises(ValueError):
+        dpt.repeat(x, 2, axis=1)
+
+    # repeats must be positive
+    x = dpt.empty(5)
+    with pytest.raises(ValueError):
+        dpt.repeat(x, -2)
+
+    # repeats must be integers
+    with pytest.raises(TypeError):
+        dpt.repeat(x, 2.0)
+
+    # repeats tuple must be the same length as axis
+    with pytest.raises(ValueError):
+        dpt.repeat(x, (1, 2))
+
+    # repeats tuple elements must be positive
+    with pytest.raises(ValueError):
+        dpt.repeat(x, (-1,))
+
+    # repeats must be int or tuple
+    with pytest.raises(TypeError):
+        dpt.repeat(x, dict())
+
+    # repeats array must be 0d or 1d
+    with pytest.raises(ValueError):
+        dpt.repeat(x, dpt.ones((1, 1), dtype="i8"))
+
+    # repeats must be castable to i8
+    with pytest.raises(TypeError):
+        dpt.repeat(x, dpt.asarray(2.0, dtype="f4"))
+
+    # compute follows data
+    q2 = dpctl.SyclQueue()
+    reps = dpt.asarray(1, dtype="i8", sycl_queue=q2)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.repeat(x, reps)
+
+    # repeats array must not contain negative elements
+    reps = dpt.asarray(-1, dtype="i8")
+    with pytest.raises(ValueError):
+        dpt.repeat(x, reps)
+    reps = dpt.asarray([1, 1, 1, 1, -1], dtype="i8")
+    with pytest.raises(ValueError):
+        dpt.repeat(x, reps)
+
+    # repeats must broadcastable to axis size
+    reps = dpt.arange(10, dtype="i8")
+    with pytest.raises(ValueError):
+        dpt.repeat(x, reps)
+
+
+def test_tile_basic():
+    get_queue_or_skip()
+
+    reps = 2
+    x = dpt.arange(5, dtype="i4")
+    res = dpt.tile(x, reps)
+    assert res.shape == (x.shape[0] * reps,)
+    assert dpt.all(res[: x.size] == res[x.size :])
+
+    reps = (2, 1)
+    expected_sh = (2, x.shape[0])
+    expected_res = dpt.broadcast_to(x, expected_sh)
+    res = dpt.tile(x, reps)
+    assert res.shape == expected_sh
+    assert dpt.all(expected_res == res)
+
+
+def test_tile_size_1():
+    get_queue_or_skip()
+
+    reps = 5
+    # test for 0d array
+    x1 = dpt.asarray(2, dtype="i4")
+    res = dpt.tile(x1, reps)
+    assert dpt.all(res == dpt.full(reps, 2, dtype="i4"))
+
+    # test for 1d array with single element
+    x2 = dpt.asarray([2], dtype="i4")
+    res = dpt.tile(x2, reps)
+    assert dpt.all(res == dpt.full(reps, 2, dtype="i4"))
+
+    reps = ()
+    # test for gh-1627 behavior
+    res = dpt.tile(x1, reps)
+    assert x1.shape == res.shape
+    assert_array_equal(dpt.asnumpy(x1), dpt.asnumpy(res))
+
+    res = dpt.tile(x2, reps)
+    assert x2.shape == res.shape
+    assert_array_equal(dpt.asnumpy(x2), dpt.asnumpy(res))
+
+
+def test_tile_prepends_axes():
+    get_queue_or_skip()
+
+    reps = (2,)
+    x = dpt.ones((5, 10), dtype="i4")
+    expected_res = dpt.ones((5, 20), dtype="i4")
+    res = dpt.tile(x, reps)
+    assert dpt.all(res == expected_res)
+
+    reps = (3, 2, 2)
+    expected_res = dpt.ones((3, 10, 20), dtype="i4")
+    res = dpt.tile(x, reps)
+    assert dpt.all(res == expected_res)
+
+
+def test_tile_empty_outputs():
+    get_queue_or_skip()
+
+    x = dpt.asarray((), dtype="i4")
+    reps = 10
+    res = dpt.tile(x, reps)
+    assert res.size == 0
+    assert res.shape == (0,)
+
+    x = dpt.ones((3, 0, 5), dtype="i4")
+    res = dpt.tile(x, reps)
+    assert res.size == 0
+    assert res.shape == (3, 0, 50)
+
+    reps = (2, 1, 2)
+    res = dpt.tile(x, reps)
+    assert res.size == 0
+    assert res.shape == (6, 0, 10)
+
+    x = dpt.ones((2, 3, 4), dtype="i4")
+    reps = (0, 1, 1)
+    res = dpt.tile(x, reps)
+    assert res.size == 0
+    assert res.shape == (0, 3, 4)
+
+
+def test_tile_strides():
+    get_queue_or_skip()
+
+    reps = (1, 2)
+    x = dpt.reshape(dpt.arange(10 * 10, dtype="i4"), (10, 10))
+    x1 = x[:, ::-2]
+    expected_res = dpt.empty((10, 10), dtype="i4")
+    expected_res[:, : x1.shape[1]], expected_res[:, x1.shape[1] :] = x1, x1
+    res = dpt.tile(x1, reps)
+    assert dpt.all(res == expected_res)
+
+    reps = (2, 1)
+    x1 = x[::-2, :]
+    expected_res = dpt.empty((10, 10), dtype="i4")
+    expected_res[: x1.shape[0], :], expected_res[x1.shape[0] :, :] = x1, x1
+    res = dpt.tile(x1, reps)
+    assert dpt.all(res == expected_res)
+
+
+def test_tile_size_1_axes():
+    get_queue_or_skip()
+
+    reps = (1, 2, 1)
+    x = dpt.ones((2, 1, 3), dtype="i4")
+    res = dpt.tile(x, reps)
+    expected_res = dpt.broadcast_to(x, (2, 2, 3))
+    assert dpt.all(res == expected_res)
+
+
+def test_tile_arg_validation():
+    get_queue_or_skip()
+
+    with pytest.raises(TypeError):
+        dpt.tile(dict(), 2)
+
+    # repetitions must be int or tuple
+    x = dpt.empty(())
+    with pytest.raises(TypeError):
+        dpt.tile(x, dict())
+
+
+def test_repeat_0_size():
+    get_queue_or_skip()
+
+    x = dpt.ones((0, 10, 0), dtype="i4")
+    repetitions = 2
+    res = dpt.repeat(x, repetitions)
+    assert res.shape == (0,)
+    res = dpt.repeat(x, repetitions, axis=2)
+    assert res.shape == x.shape
+    res = dpt.repeat(x, repetitions, axis=1)
+    axis_sz = x.shape[1] * repetitions
+    assert res.shape == (0, 20, 0)
+
+    repetitions = dpt.asarray(2, dtype="i4")
+    res = dpt.repeat(x, repetitions)
+    assert res.shape == (0,)
+    res = dpt.repeat(x, repetitions, axis=2)
+    assert res.shape == x.shape
+    res = dpt.repeat(x, repetitions, axis=1)
+    assert res.shape == (0, 20, 0)
+
+    repetitions = dpt.arange(10, dtype="i4")
+    res = dpt.repeat(x, repetitions, axis=1)
+    axis_sz = dpt.sum(repetitions)
+    assert res.shape == (0, axis_sz, 0)
+
+    repetitions = (2,) * 10
+    res = dpt.repeat(x, repetitions, axis=1)
+    axis_sz = 2 * x.shape[1]
+    assert res.shape == (0, axis_sz, 0)
+
+
+def test_result_type_bug_1874():
+    py_sc = True
+    np_sc = np.asarray([py_sc])[0]
+    dts_bool = [py_sc, np_sc]
+    py_sc = int(1)
+    np_sc = np.asarray([py_sc])[0]
+    dts_ints = [py_sc, np_sc]
+    dts_floats = [float(1), np.float64(1)]
+    dts_complexes = [complex(1), np.complex128(1)]
+
+    # iterate over two categories
+    for dts1, dts2 in itertools.product(
+        [dts_bool, dts_ints, dts_floats, dts_complexes], repeat=2
+    ):
+        res_dts = []
+        # iterate over Python scalar/NumPy scalar choices within categories
+        for dt1, dt2 in itertools.product(dts1, dts2):
+            res_dt = dpt.result_type(dt1, dt2)
+            res_dts.append(res_dt)
+        # check that all results are the same
+        assert res_dts and all(res_dts[0] == el for el in res_dts[1:])
diff --git a/dpnp/tests/tensor/test_usm_ndarray_operators.py b/dpnp/tests/tensor/test_usm_ndarray_operators.py
new file mode 100644
index 000000000000..8ac178def197
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_operators.py
@@ -0,0 +1,154 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import pytest
+
+import dpnp.tensor as dpt
+
+
+class Dummy:
+    @staticmethod
+    def abs(a):
+        return a
+
+    @staticmethod
+    def add(a, b):
+        if isinstance(a, dpt.usm_ndarray):
+            return a
+        else:
+            return b
+
+    @staticmethod
+    def subtract(a, b):
+        if isinstance(a, dpt.usm_ndarray):
+            return a
+        else:
+            return b
+
+    @staticmethod
+    def multiply(a, b):
+        if isinstance(a, dpt.usm_ndarray):
+            return a
+        else:
+            return b
+
+
+@pytest.mark.parametrize("namespace", [dpt, Dummy()])
+def test_fp_ops(namespace):
+    try:
+        X = dpt.ones(1)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    X._set_namespace(namespace)
+    assert X.__array_namespace__() is namespace
+    X[0] = -2.5
+    X.__abs__()
+    X.__add__(1.0)
+    X.__radd__(1.0)
+    X.__sub__(1.0)
+    X.__rsub__(1.0)
+    X.__mul__(1.0)
+    X.__rmul__(1.0)
+    X.__truediv__(1.0)
+    X.__rtruediv__(1.0)
+    X.__floordiv__(1.0)
+    X.__rfloordiv__(1.0)
+    X.__pos__()
+    X.__neg__()
+    X.__eq__(-2.5)
+    X.__ne__(-2.5)
+    X.__le__(-2.5)
+    X.__ge__(-2.5)
+    X.__gt__(-2.0)
+    X.__iadd__(X)
+    X.__isub__(X)
+    X.__imul__(X)
+    X.__itruediv__(1.0)
+    X.__ifloordiv__(1.0)
+
+
+@pytest.mark.parametrize("namespace", [dpt, Dummy()])
+def test_int_ops(namespace):
+    try:
+        X = dpt.usm_ndarray(1, "i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    X._set_namespace(namespace)
+    assert X.__array_namespace__() is namespace
+    X.__lshift__(2)
+    X.__rshift__(2)
+    X.__rlshift__(2)
+    X.__rrshift__(2)
+    X.__ilshift__(2)
+    X.__irshift__(2)
+    X.__and__(X)
+    X.__rand__(X)
+    X.__iand__(X)
+    X.__or__(X)
+    X.__ror__(X)
+    X.__ior__(X)
+    X.__xor__(X)
+    X.__rxor__(X)
+    X.__ixor__(X)
+    X.__invert__()
+    X.__mod__(5)
+    X.__rmod__(5)
+    X.__imod__(5)
+    X.__pow__(2)
+    X.__rpow__(2)
+    X.__ipow__(2)
+
+
+@pytest.mark.parametrize("namespace", [dpt, Dummy()])
+def test_mat_ops(namespace):
+    try:
+        M = dpt.eye(3, 3)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    M._set_namespace(namespace)
+    assert M.__array_namespace__() is namespace
+    M.__matmul__(M)
+    M.__imatmul__(M)
+    M.__rmatmul__(M)
+
+
+@pytest.mark.parametrize("namespace", [dpt, Dummy()])
+def test_comp_ops(namespace):
+    try:
+        X = dpt.asarray(1, dtype="u8")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    X._set_namespace(namespace)
+    assert X.__array_namespace__() is namespace
+    assert X.__gt__(-1)
+    assert X.__ge__(-1)
+    assert not X.__lt__(-1)
+    assert not X.__le__(-1)
+    assert not X.__eq__(-1)
+    assert X.__ne__(-1)
diff --git a/dpnp/tests/tensor/test_usm_ndarray_print.py b/dpnp/tests/tensor/test_usm_ndarray_print.py
new file mode 100644
index 000000000000..94dbfca7c198
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_print.py
@@ -0,0 +1,408 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+class TestPrint:
+    def setup_method(self):
+        self._retain_options = dpt.get_print_options()
+
+    def teardown_method(self):
+        dpt.set_print_options(**self._retain_options)
+
+
+class TestArgValidation(TestPrint):
+    @pytest.mark.parametrize(
+        "arg,err",
+        [
+            ({"linewidth": "I"}, TypeError),
+            ({"edgeitems": "I"}, TypeError),
+            ({"threshold": "I"}, TypeError),
+            ({"precision": "I"}, TypeError),
+            ({"floatmode": "I"}, ValueError),
+            ({"edgeitems": "I"}, TypeError),
+            ({"sign": "I"}, ValueError),
+            ({"nanstr": np.nan}, TypeError),
+            ({"infstr": np.nan}, TypeError),
+        ],
+    )
+    def test_print_option_arg_validation(self, arg, err):
+        with pytest.raises(err):
+            dpt.set_print_options(**arg)
+
+    def test_usm_ndarray_repr_arg_validation(self):
+        X = {}
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_repr(X)
+
+        try:
+            X = dpt.arange(4)
+        except dpctl.SyclDeviceCreationError:
+            pytest.skip("No SYCL devices available")
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_repr(X, line_width="I")
+
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_repr(X, precision="I")
+
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_repr(X, prefix=4)
+
+    def test_usm_ndarray_str_arg_validation(self):
+        X = {}
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_str(X)
+
+        try:
+            X = dpt.arange(4)
+        except dpctl.SyclDeviceCreationError:
+            pytest.skip("No SYCL devices available")
+
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_str(X, line_width="I")
+
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_str(X, edge_items="I")
+
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_str(X, threshold="I")
+
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_str(X, precision="I")
+
+        with pytest.raises(ValueError):
+            dpt.usm_ndarray_str(X, floatmode="I")
+
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_str(X, edge_items="I")
+
+        with pytest.raises(ValueError):
+            dpt.usm_ndarray_str(X, sign="I")
+
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_str(X, prefix=4)
+
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_str(X, prefix=4)
+
+        with pytest.raises(TypeError):
+            dpt.usm_ndarray_str(X, suffix=4)
+
+
+class TestSetPrintOptions(TestPrint):
+    def test_set_linewidth(self):
+        q = get_queue_or_skip()
+
+        dpt.set_print_options(linewidth=1)
+        x = dpt.asarray([0, 1], sycl_queue=q)
+        assert str(x) == "[0\n 1]"
+
+    def test_set_precision(self):
+        q = get_queue_or_skip()
+
+        dpt.set_print_options(precision=4)
+        x = dpt.asarray([1.23450], sycl_queue=q)
+        assert str(x) == "[1.2345]"
+
+    def test_threshold_edgeitems(self):
+        q = get_queue_or_skip()
+
+        dpt.set_print_options(threshold=1, edgeitems=1)
+        x = dpt.arange(9, sycl_queue=q)
+        assert str(x) == "[0 ... 8]"
+        dpt.set_print_options(edgeitems=9)
+        assert str(x) == "[0 1 2 3 4 5 6 7 8]"
+
+    def test_floatmodes(self):
+        q = get_queue_or_skip()
+
+        x = dpt.asarray([0.1234, 0.1234678], sycl_queue=q)
+        dpt.set_print_options(floatmode="fixed", precision=4)
+        assert str(x) == "[0.1234 0.1235]"
+
+        dpt.set_print_options(floatmode="unique")
+        assert str(x) == "[0.1234    0.1234678]"
+
+        dpt.set_print_options(floatmode="maxprec")
+        assert str(x) == "[0.1234 0.1235]"
+
+        dpt.set_print_options(floatmode="maxprec", precision=8)
+        assert str(x) == "[0.1234    0.1234678]"
+
+        dpt.set_print_options(floatmode="maxprec_equal", precision=4)
+        assert str(x) == "[0.1234 0.1235]"
+
+        dpt.set_print_options(floatmode="maxprec_equal", precision=8)
+        assert str(x) == "[0.1234000 0.1234678]"
+
+    def test_nan_inf_suppress(self):
+        q = get_queue_or_skip()
+
+        dpt.set_print_options(nanstr="nan1", infstr="inf1")
+        x = dpt.asarray([np.nan, np.inf], sycl_queue=q)
+        assert str(x) == "[nan1 inf1]"
+
+    def test_suppress_small(self):
+        q = get_queue_or_skip()
+
+        dpt.set_print_options(suppress=True)
+        x = dpt.asarray(5e-10, sycl_queue=q)
+        assert str(x) == "0."
+
+    def test_sign(self):
+        q = get_queue_or_skip()
+
+        x = dpt.asarray([0.0, 1.0, 2.0], sycl_queue=q)
+        y = dpt.asarray(1.0, sycl_queue=q)
+        z = dpt.asarray([1.0 + 1.0j], sycl_queue=q)
+        assert str(x) == "[0. 1. 2.]"
+        assert str(y) == "1."
+        assert str(z) == "[1.+1.j]"
+
+        dpt.set_print_options(sign="+")
+        assert str(x) == "[+0. +1. +2.]"
+        assert str(y) == "+1."
+        assert str(z) == "[+1.+1.j]"
+
+        dpt.set_print_options(sign=" ")
+        assert str(x) == "[ 0.  1.  2.]"
+        assert str(y) == " 1."
+        assert str(z) == "[ 1.+1.j]"
+
+    def test_numpy(self):
+        dpt.set_print_options(numpy=True)
+        options = dpt.get_print_options()
+        np_options = np.get_printoptions()
+        assert all(np_options[k] == options[k] for k in options.keys())
+
+
+class TestPrintFns(TestPrint):
+    @pytest.mark.parametrize(
+        "dtype,x_str",
+        [
+            ("b1", "[False  True  True  True]"),
+            ("i1", "[0 1 2 3]"),
+            ("u1", "[0 1 2 3]"),
+            ("i2", "[0 1 2 3]"),
+            ("u2", "[0 1 2 3]"),
+            ("i4", "[0 1 2 3]"),
+            ("u4", "[0 1 2 3]"),
+            ("i8", "[0 1 2 3]"),
+            ("u8", "[0 1 2 3]"),
+            ("f2", "[0. 1. 2. 3.]"),
+            ("f4", "[0. 1. 2. 3.]"),
+            ("f8", "[0. 1. 2. 3.]"),
+            ("c8", "[0.+0.j 1.+0.j 2.+0.j 3.+0.j]"),
+            ("c16", "[0.+0.j 1.+0.j 2.+0.j 3.+0.j]"),
+        ],
+    )
+    def test_print_types(self, dtype, x_str):
+        q = get_queue_or_skip()
+        skip_if_dtype_not_supported(dtype, q)
+
+        x = dpt.asarray([0, 1, 2, 3], dtype=dtype, sycl_queue=q)
+        assert str(x) == x_str
+
+    def test_print_str(self):
+        q = get_queue_or_skip()
+
+        x = dpt.asarray(0, sycl_queue=q)
+        assert str(x) == "0"
+
+        x = dpt.asarray([np.nan, np.inf], sycl_queue=q)
+        assert str(x) == "[nan inf]"
+
+        x = dpt.arange(9, sycl_queue=q)
+        assert str(x) == "[0 1 2 3 4 5 6 7 8]"
+
+        y = dpt.reshape(x, (3, 3), copy=True)
+        assert str(y) == "[[0 1 2]\n [3 4 5]\n [6 7 8]]"
+
+    def test_print_str_abbreviated(self):
+        q = get_queue_or_skip()
+
+        dpt.set_print_options(threshold=0, edgeitems=1)
+        x = dpt.arange(9, sycl_queue=q)
+        assert str(x) == "[0 ... 8]"
+
+        x = dpt.reshape(x, (3, 3))
+        assert str(x) == "[[0 ... 2]\n ...\n [6 ... 8]]"
+
+    def test_usm_ndarray_str_separator(self):
+        q = get_queue_or_skip()
+
+        x = dpt.reshape(dpt.arange(4, sycl_queue=q), (2, 2))
+
+        np.testing.assert_equal(
+            dpt.usm_ndarray_str(x, prefix="test", separator="   "),
+            "[[0   1]\n     [2   3]]",
+        )
+
+    def test_print_repr(self):
+        q = get_queue_or_skip()
+
+        x = dpt.asarray(3, dtype="int64", sycl_queue=q)
+        assert repr(x) == "usm_ndarray(3)"
+
+        x = dpt.asarray([np.nan, np.inf], sycl_queue=q)
+        if x.sycl_device.has_aspect_fp64:
+            assert repr(x) == "usm_ndarray([nan, inf])"
+        else:
+            assert repr(x) == "usm_ndarray([nan, inf], dtype=float32)"
+
+        x = dpt.arange(9, sycl_queue=q, dtype="int64")
+        assert repr(x) == "usm_ndarray([0, 1, 2, 3, 4, 5, 6, 7, 8])"
+
+        x = dpt.reshape(x, (3, 3))
+        np.testing.assert_equal(
+            repr(x),
+            "usm_ndarray([[0, 1, 2],"
+            "\n             [3, 4, 5],"
+            "\n             [6, 7, 8]])",
+        )
+
+        x = dpt.arange(4, dtype="i4", sycl_queue=q)
+        assert repr(x) == "usm_ndarray([0, 1, 2, 3], dtype=int32)"
+
+        dpt.set_print_options(linewidth=1)
+        np.testing.assert_equal(
+            repr(x),
+            "usm_ndarray([0,"
+            "\n             1,"
+            "\n             2,"
+            "\n             3],"
+            "\n            dtype=int32)",
+        )
+
+        # zero-size array
+        dpt.set_print_options(linewidth=75)
+        x = dpt.ones((9, 0), dtype="i4", sycl_queue=q)
+        assert repr(x) == "usm_ndarray([], shape=(9, 0), dtype=int32)"
+
+    def test_print_repr_abbreviated(self):
+        q = get_queue_or_skip()
+
+        dpt.set_print_options(threshold=0, edgeitems=1)
+        x = dpt.arange(9, dtype="int64", sycl_queue=q)
+        assert repr(x) == "usm_ndarray([0, ..., 8], shape=(9,))"
+
+        y = dpt.asarray(x, dtype="i4", copy=True)
+        assert repr(y) == "usm_ndarray([0, ..., 8], shape=(9,), dtype=int32)"
+
+        x = dpt.reshape(x, (3, 3))
+        np.testing.assert_equal(
+            repr(x),
+            "usm_ndarray([[0, ..., 2],"
+            "\n             ...,"
+            "\n             [6, ..., 8]], shape=(3, 3))",
+        )
+
+        y = dpt.reshape(y, (3, 3))
+        np.testing.assert_equal(
+            repr(y),
+            "usm_ndarray([[0, ..., 2],"
+            "\n             ...,"
+            "\n             [6, ..., 8]], shape=(3, 3), dtype=int32)",
+        )
+
+        dpt.set_print_options(linewidth=1)
+        np.testing.assert_equal(
+            repr(y),
+            "usm_ndarray([[0,"
+            "\n              ...,"
+            "\n              2],"
+            "\n             ...,"
+            "\n             [6,"
+            "\n              ...,"
+            "\n              8]],"
+            "\n            shape=(3, 3),"
+            "\n            dtype=int32)",
+        )
+
+    @pytest.mark.parametrize(
+        "dtype",
+        [
+            "i1",
+            "u1",
+            "i2",
+            "u2",
+            "i4",
+            "u4",
+            "u8",
+            "f2",
+            "f4",
+            "c8",
+        ],
+    )
+    def test_repr_appended_dtype(self, dtype):
+        q = get_queue_or_skip()
+        skip_if_dtype_not_supported(dtype, q)
+
+        x = dpt.empty(4, dtype=dtype)
+        assert repr(x).split("=")[-1][:-1] == x.dtype.name
+
+    def test_usm_ndarray_repr_prefix(self):
+        q = get_queue_or_skip()
+
+        x = dpt.arange(4, dtype=np.intp, sycl_queue=q)
+        np.testing.assert_equal(
+            dpt.usm_ndarray_repr(x, prefix="test"), "test([0, 1, 2, 3])"
+        )
+        x = dpt.reshape(x, (2, 2))
+        np.testing.assert_equal(
+            dpt.usm_ndarray_repr(x, prefix="test"),
+            "test([[0, 1]," "\n      [2, 3]])",
+        )
+
+
+class TestContextManager:
+    def test_context_manager_basic(self):
+        options = dpt.get_print_options()
+        try:
+            X = dpt.asarray(1.234567)
+        except dpctl.SyclDeviceCreationError:
+            pytest.skip("No SYCL devices available")
+        with dpt.print_options(precision=4):
+            s = str(X)
+        assert s == "1.2346"
+        assert options == dpt.get_print_options()
+
+    def test_context_manager_as(self):
+        with dpt.print_options(precision=4) as x:
+            options = x.copy()
+        assert options["precision"] == 4
diff --git a/dpnp/tests/tensor/test_usm_ndarray_reductions.py b/dpnp/tests/tensor/test_usm_ndarray_reductions.py
new file mode 100644
index 000000000000..4d828fbdbd49
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_reductions.py
@@ -0,0 +1,706 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from random import randrange
+
+import numpy as np
+import pytest
+from dpctl.utils import ExecutionPlacementError
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+from dpnp.tensor._tensor_impl import default_device_index_type
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_no_complex_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+]
+
+
+_all_dtypes = _no_complex_dtypes + [
+    "c8",
+    "c16",
+]
+
+
+def test_max_min_axis():
+    get_queue_or_skip()
+
+    x = dpt.reshape(
+        dpt.arange((3 * 4 * 5 * 6 * 7), dtype="i4"), (3, 4, 5, 6, 7)
+    )
+
+    m = dpt.max(x, axis=(1, 2, -1))
+    assert m.shape == (3, 6)
+    assert dpt.all(m == x[:, -1, -1, :, -1])
+
+    m = dpt.min(x, axis=(1, 2, -1))
+    assert m.shape == (3, 6)
+    assert dpt.all(m == x[:, 0, 0, :, 0])
+
+
+def test_max_axis1_axis0():
+    """See gh-1455"""
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(3 * 4 * 5), (3, 4, 5))
+
+    m = dpt.max(x, axis=0)
+    assert dpt.all(m == x[-1, :, :])
+
+    x = dpt.flip(x, axis=2)
+    m = dpt.max(x, axis=2)
+    assert dpt.all(m == x[:, :, 0])
+
+
+def test_reduction_keepdims():
+    get_queue_or_skip()
+
+    n0, n1 = 3, 6
+    x = dpt.ones((n0, 4, 5, n1, 7), dtype="i4")
+    m = dpt.max(x, axis=(1, 2, -1), keepdims=True)
+
+    xx = dpt.reshape(dpt.permute_dims(x, (0, 3, 1, 2, -1)), (n0, n1, -1))
+    p = dpt.argmax(xx, axis=-1, keepdims=True)
+
+    assert m.shape == (n0, 1, 1, n1, 1)
+    assert dpt.all(m == dpt.reshape(x[:, 0, 0, :, 0], m.shape))
+    assert dpt.all(p == 0)
+
+
+def test_max_scalar():
+    get_queue_or_skip()
+
+    x = dpt.ones(())
+    m = dpt.max(x)
+
+    assert m.shape == ()
+    assert x == m
+
+
+@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"])
+def test_reduction_kernels(arg_dtype):
+    # i4 - always uses atomics w/ sycl group reduction
+    # f4 - always uses atomics w/ custom group reduction
+    # c8 - always uses temps w/ custom group reduction
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q)
+    x[x.shape[0] // 2, :] = 3
+    x[:, x.shape[1] // 2] = 3
+
+    m = dpt.max(x)
+    assert m == 3
+    m = dpt.max(x, axis=0)
+    assert dpt.all(m == 3)
+    m = dpt.max(x, axis=1)
+    assert dpt.all(m == 3)
+
+    x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q)
+    x[x.shape[0] // 2, :] = 0
+    x[:, x.shape[1] // 2] = 0
+
+    m = dpt.min(x)
+    assert m == 0
+    m = dpt.min(x, axis=0)
+    assert dpt.all(m == 0)
+    m = dpt.min(x, axis=1)
+    assert dpt.all(m == 0)
+
+
+def test_max_min_nan_propagation():
+    get_queue_or_skip()
+
+    # float, finites
+    x = dpt.arange(4, dtype="f4")
+    x[0] = dpt.nan
+    assert dpt.isnan(dpt.max(x))
+    assert dpt.isnan(dpt.min(x))
+
+    # float, infinities
+    x[1:] = dpt.inf
+    assert dpt.isnan(dpt.max(x))
+    x[1:] = -dpt.inf
+    assert dpt.isnan(dpt.min(x))
+
+    # complex
+    x = dpt.arange(4, dtype="c8")
+    x[0] = complex(dpt.nan, 0)
+    assert dpt.isnan(dpt.max(x))
+    assert dpt.isnan(dpt.min(x))
+
+    x[0] = complex(0, dpt.nan)
+    assert dpt.isnan(dpt.max(x))
+    assert dpt.isnan(dpt.min(x))
+
+
+def test_argmax_scalar():
+    get_queue_or_skip()
+
+    x = dpt.ones(())
+    m = dpt.argmax(x)
+
+    assert m.shape == ()
+    assert m == 0
+
+
+@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"])
+def test_search_reduction_kernels(arg_dtype):
+    # i4 - always uses atomics w/ sycl group reduction
+    # f4 - always uses atomics w/ custom group reduction
+    # c8 - always uses temps w/ custom group reduction
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    x_shape = (24, 1024)
+    x_size = np.prod(x_shape)
+    x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q)
+    idx = randrange(x.size)
+    idx_tup = np.unravel_index(idx, x_shape)
+    x[idx] = 2
+
+    m = dpt.argmax(x)
+    assert m == idx
+
+    # test case of strided input mapping to contig
+    # implementation
+    m = dpt.argmax(dpt.flip(x))
+    assert m == x.size - 1 - idx
+
+    # test case of strided implementation
+    y = dpt.ones(2 * x.size, dtype=arg_dtype, sycl_queue=q)
+    y[::2] = x
+    m = dpt.argmax(y)
+    assert m == 2 * idx
+
+    x = dpt.reshape(x, x_shape)
+
+    x[idx_tup[0], :] = 3
+    m = dpt.argmax(x, axis=0)
+    assert dpt.all(m == idx_tup[0])
+    x[:, idx_tup[1]] = 4
+    m = dpt.argmax(x, axis=1)
+    assert dpt.all(m == idx_tup[1])
+
+    x = x[:, ::-2]
+    idx = randrange(x.shape[1])
+    x[:, idx] = 5
+    m = dpt.argmax(x, axis=1)
+    assert dpt.all(m == idx)
+
+    x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q)
+    idx = randrange(x.size)
+    idx_tup = np.unravel_index(idx, x_shape)
+    x[idx] = 0
+
+    m = dpt.argmin(x)
+    assert m == idx
+
+    x = dpt.reshape(x, x_shape)
+
+    x[idx_tup[0], :] = -1
+    m = dpt.argmin(x, axis=0)
+    assert dpt.all(m == idx_tup[0])
+    x[:, idx_tup[1]] = -2
+    m = dpt.argmin(x, axis=1)
+    assert dpt.all(m == idx_tup[1])
+
+    x = x[:, ::-2]
+    idx = randrange(x.shape[1])
+    x[:, idx] = -3
+    m = dpt.argmin(x, axis=1)
+    assert dpt.all(m == idx)
+
+
+def test_argmax_argmin_nan_propagation():
+    get_queue_or_skip()
+
+    sz = 4
+    idx = randrange(sz)
+    # floats
+    x = dpt.arange(sz, dtype="f4")
+    x[idx] = dpt.nan
+    assert dpt.argmax(x) == idx
+    assert dpt.argmin(x) == idx
+
+    # complex
+    x = dpt.arange(sz, dtype="c8")
+    x[idx] = complex(dpt.nan, 0)
+    assert dpt.argmax(x) == idx
+    assert dpt.argmin(x) == idx
+
+    x[idx] = complex(0, dpt.nan)
+    assert dpt.argmax(x) == idx
+    assert dpt.argmin(x) == idx
+
+
+def test_argmax_argmin_identities():
+    # make sure that identity arrays work as expected
+    get_queue_or_skip()
+
+    x = dpt.full(3, dpt.iinfo(dpt.int32).min, dtype="i4")
+    assert dpt.argmax(x) == 0
+    x = dpt.full(3, dpt.iinfo(dpt.int32).max, dtype="i4")
+    assert dpt.argmin(x) == 0
+
+
+@pytest.mark.parametrize("order", ["C", "F"])
+def test_argmax_axis0_axis1(order):
+    get_queue_or_skip()
+
+    x = dpt.asarray([[1, 2, 3], [6, 5, 4]], dtype="i4", order=order)
+    assert dpt.argmax(x) == 3
+
+    res = dpt.argmax(x, axis=0)
+    expected = dpt.asarray([1, 1, 1], dtype=res.dtype)
+    assert dpt.all(res == expected)
+
+    res = dpt.argmax(x, axis=1)
+    expected = dpt.asarray([2, 0], dtype=res.dtype)
+    assert dpt.all(res == expected)
+
+
+def test_reduction_arg_validation():
+    get_queue_or_skip()
+
+    x = {}
+    with pytest.raises(TypeError):
+        dpt.sum(x)
+    with pytest.raises(TypeError):
+        dpt.max(x)
+    with pytest.raises(TypeError):
+        dpt.argmax(x)
+
+    x = dpt.zeros((0,), dtype="i4")
+    with pytest.raises(ValueError):
+        dpt.max(x)
+    with pytest.raises(ValueError):
+        dpt.argmax(x)
+
+
+@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:])
+def test_logsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.logsumexp(m)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype.kind == "f"
+    tol = dpt.finfo(r.dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(r),
+        np.logaddexp.reduce(dpt.asnumpy(m), dtype=r.dtype),
+        rtol=tol,
+        atol=tol,
+    )
+
+
+def test_logsumexp_empty():
+    get_queue_or_skip()
+    x = dpt.empty((0,), dtype="f4")
+    y = dpt.logsumexp(x)
+    assert y.shape == ()
+    assert y == -dpt.inf
+
+
+def test_logsumexp_axis():
+    get_queue_or_skip()
+
+    m = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
+    s = dpt.logsumexp(m, axis=(1, 2, -1))
+
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 6)
+    tol = dpt.finfo(s.dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(s),
+        np.logaddexp.reduce(dpt.asnumpy(m), axis=(1, 2, -1), dtype=s.dtype),
+        rtol=tol,
+        atol=tol,
+    )
+
+
+@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("out_dtype", _all_dtypes[1:])
+def test_logsumexp_arg_out_dtype_matrix(arg_dtype, out_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+    skip_if_dtype_not_supported(out_dtype, q)
+
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.logsumexp(m, dtype=out_dtype)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == dpt.dtype(out_dtype)
+
+
+def test_logsumexp_keepdims():
+    get_queue_or_skip()
+
+    m = dpt.ones((3, 4, 5, 6, 7), dtype="i4")
+    s = dpt.logsumexp(m, axis=(1, 2, -1), keepdims=True)
+
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 1, 1, 6, 1)
+
+
+def test_logsumexp_keepdims_zero_size():
+    get_queue_or_skip()
+    n = 10
+    a = dpt.ones((n, 0, n))
+
+    s1 = dpt.logsumexp(a, keepdims=True)
+    assert s1.shape == (1, 1, 1)
+
+    s2 = dpt.logsumexp(a, axis=(0, 1), keepdims=True)
+    assert s2.shape == (1, 1, n)
+
+    s3 = dpt.logsumexp(a, axis=(1, 2), keepdims=True)
+    assert s3.shape == (n, 1, 1)
+
+    s4 = dpt.logsumexp(a, axis=(0, 2), keepdims=True)
+    assert s4.shape == (1, 0, 1)
+
+    a0 = a[0]
+    s5 = dpt.logsumexp(a0, keepdims=True)
+    assert s5.shape == (1, 1)
+
+
+def test_logsumexp_scalar():
+    get_queue_or_skip()
+
+    m = dpt.ones(())
+    s = dpt.logsumexp(m)
+
+    assert isinstance(s, dpt.usm_ndarray)
+    assert m.sycl_queue == s.sycl_queue
+    assert s.shape == ()
+
+
+def test_logsumexp_complex():
+    get_queue_or_skip()
+
+    x = dpt.zeros(1, dtype="c8")
+    with pytest.raises(ValueError):
+        dpt.logsumexp(x)
+
+
+def test_logsumexp_int_axis():
+    get_queue_or_skip()
+
+    x = dpt.zeros((8, 10), dtype="f4")
+    res = dpt.logsumexp(x, axis=0)
+    assert res.ndim == 1
+    assert res.shape[0] == 10
+
+
+def test_logsumexp_invalid_arr():
+    x = {}
+    with pytest.raises(TypeError):
+        dpt.logsumexp(x)
+
+
+@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:])
+def test_hypot_arg_dtype_default_output_dtype_matrix(arg_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.reduce_hypot(m)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype.kind == "f"
+    tol = dpt.finfo(r.dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(r),
+        np.hypot.reduce(dpt.asnumpy(m), dtype=r.dtype),
+        rtol=tol,
+        atol=tol,
+    )
+
+
+def test_hypot_empty():
+    get_queue_or_skip()
+    x = dpt.empty((0,), dtype="f4")
+    y = dpt.reduce_hypot(x)
+    assert y.shape == ()
+    assert y == 0
+
+
+@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("out_dtype", _all_dtypes[1:])
+def test_hypot_arg_out_dtype_matrix(arg_dtype, out_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+    skip_if_dtype_not_supported(out_dtype, q)
+
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.reduce_hypot(m, dtype=out_dtype)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == dpt.dtype(out_dtype)
+
+
+def test_hypot_complex():
+    get_queue_or_skip()
+
+    x = dpt.zeros(1, dtype="c8")
+    with pytest.raises(ValueError):
+        dpt.reduce_hypot(x)
+
+
+def test_tree_reduction_axis1_axis0():
+    """See gh-1455"""
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5))
+
+    m = dpt.logsumexp(x, axis=0)
+    tol = dpt.finfo(m.dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(m),
+        np.logaddexp.reduce(dpt.asnumpy(x), axis=0, dtype=m.dtype),
+        rtol=tol,
+        atol=tol,
+    )
+
+    x = dpt.flip(x, axis=2)
+    m = dpt.logsumexp(x, axis=2)
+    assert_allclose(
+        dpt.asnumpy(m),
+        np.logaddexp.reduce(dpt.asnumpy(x), axis=2, dtype=m.dtype),
+        rtol=tol,
+        atol=tol,
+    )
+
+
+def test_numeric_reduction_out_kwarg():
+    get_queue_or_skip()
+
+    n1, n2, n3 = 3, 4, 5
+    x = dpt.ones((n1, n2, n3), dtype="i8")
+    out = dpt.zeros((2 * n1, 3 * n2), dtype="i8")
+    res = dpt.sum(x, axis=-1, out=out[::-2, 1::3])
+    assert dpt.all(out[::-2, 0::3] == 0)
+    assert dpt.all(out[::-2, 2::3] == 0)
+    assert dpt.all(out[::-2, 1::3] == res)
+    assert dpt.all(out[::-2, 1::3] == 5)
+
+    out = dpt.zeros((2 * n1, 3 * n2, 1), dtype="i8")
+    res = dpt.sum(x, axis=-1, keepdims=True, out=out[::-2, 1::3])
+    assert res.shape == (n1, n2, 1)
+    assert dpt.all(out[::-2, 0::3] == 0)
+    assert dpt.all(out[::-2, 2::3] == 0)
+    assert dpt.all(out[::-2, 1::3] == res)
+    assert dpt.all(out[::-2, 1::3] == 5)
+
+    res = dpt.sum(x, axis=0, out=x[-1])
+    assert dpt.all(x[-1] == res)
+    assert dpt.all(x[-1] == 3)
+    assert dpt.all(x[0:-1] == 1)
+
+    # test no-op case
+    x = dpt.ones((n1, n2, n3), dtype="i8")
+    out = dpt.zeros((2 * n1, 3 * n2, n3), dtype="i8")
+    res = dpt.sum(x, axis=(), out=out[::-2, 1::3])
+    assert dpt.all(out[::-2, 0::3] == 0)
+    assert dpt.all(out[::-2, 2::3] == 0)
+    assert dpt.all(out[::-2, 1::3] == x)
+
+    # test with dtype kwarg
+    x = dpt.ones((n1, n2, n3), dtype="i4")
+    out = dpt.zeros((2 * n1, 3 * n2), dtype="f4")
+    res = dpt.sum(x, axis=-1, dtype="f4", out=out[::-2, 1::3])
+    zero_res = dpt.zeros_like(res)
+    assert dpt.allclose(out[::-2, 0::3], zero_res)
+    assert dpt.allclose(out[::-2, 2::3], zero_res)
+    assert dpt.allclose(out[::-2, 1::3], res)
+    assert dpt.allclose(out[::-2, 1::3], dpt.full_like(res, 5, dtype="f4"))
+
+
+def test_comparison_reduction_out_kwarg():
+    get_queue_or_skip()
+
+    n1, n2, n3 = 3, 4, 5
+    x = dpt.reshape(dpt.arange(n1 * n2 * n3, dtype="i4"), (n1, n2, n3))
+    out = dpt.zeros((2 * n1, 3 * n2), dtype="i4")
+    res = dpt.max(x, axis=-1, out=out[::-2, 1::3])
+    assert dpt.all(out[::-2, 0::3] == 0)
+    assert dpt.all(out[::-2, 2::3] == 0)
+    assert dpt.all(out[::-2, 1::3] == res)
+    assert dpt.all(out[::-2, 1::3] == x[:, :, -1])
+
+    out = dpt.zeros((2 * n1, 3 * n2, 1), dtype="i4")
+    res = dpt.max(x, axis=-1, keepdims=True, out=out[::-2, 1::3])
+    assert res.shape == (n1, n2, 1)
+    assert dpt.all(out[::-2, 0::3] == 0)
+    assert dpt.all(out[::-2, 2::3] == 0)
+    assert dpt.all(out[::-2, 1::3] == res)
+    assert dpt.all(out[::-2, 1::3] == x[:, :, -1, dpt.newaxis])
+
+    # test no-op case
+    out = dpt.zeros((2 * n1, 3 * n2, n3), dtype="i4")
+    res = dpt.max(x, axis=(), out=out[::-2, 1::3])
+    assert dpt.all(out[::-2, 0::3] == 0)
+    assert dpt.all(out[::-2, 2::3] == 0)
+    assert dpt.all(out[::-2, 1::3] == x)
+
+    # test overlap
+    res = dpt.max(x, axis=0, out=x[0])
+    assert dpt.all(x[0] == res)
+    assert dpt.all(x[0] == x[-1])
+
+
+def test_search_reduction_out_kwarg():
+    get_queue_or_skip()
+
+    n1, n2, n3 = 3, 4, 5
+    dt = dpt.__array_namespace_info__().default_dtypes()["indexing"]
+
+    x = dpt.reshape(dpt.arange(n1 * n2 * n3, dtype=dt), (n1, n2, n3))
+    out = dpt.zeros((2 * n1, 3 * n2), dtype=dt)
+    res = dpt.argmax(x, axis=-1, out=out[::-2, 1::3])
+    assert dpt.all(out[::-2, 0::3] == 0)
+    assert dpt.all(out[::-2, 2::3] == 0)
+    assert dpt.all(out[::-2, 1::3] == res)
+    assert dpt.all(out[::-2, 1::3] == n2)
+
+    out = dpt.zeros((2 * n1, 3 * n2, 1), dtype=dt)
+    res = dpt.argmax(x, axis=-1, keepdims=True, out=out[::-2, 1::3])
+    assert res.shape == (n1, n2, 1)
+    assert dpt.all(out[::-2, 0::3] == 0)
+    assert dpt.all(out[::-2, 2::3] == 0)
+    assert dpt.all(out[::-2, 1::3] == res)
+    assert dpt.all(out[::-2, 1::3] == n3 - 1)
+
+    # test no-op case
+    x = dpt.ones((), dtype=dt)
+    out = dpt.ones(2, dtype=dt)
+    res = dpt.argmax(x, axis=None, out=out[1])
+    assert dpt.all(out[0] == 1)
+    assert dpt.all(out[1] == 0)
+
+    # test overlap
+    x = dpt.reshape(dpt.arange(n1 * n2, dtype=dt), (n1, n2))
+    res = dpt.argmax(x, axis=0, out=x[0])
+    assert dpt.all(x[0] == res)
+    assert dpt.all(x[0] == n1 - 1)
+
+
+def test_reduction_out_kwarg_arg_validation():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    ind_dt = dpt.__array_namespace_info__().default_dtypes()["indexing"]
+
+    x = dpt.ones(10, dtype="f4")
+    out_wrong_queue = dpt.empty((), dtype="f4", sycl_queue=q2)
+    out_wrong_dtype = dpt.empty((), dtype="i4", sycl_queue=q1)
+    out_wrong_shape = dpt.empty(1, dtype="f4", sycl_queue=q1)
+    out_wrong_keepdims = dpt.empty((), dtype="f4", sycl_queue=q1)
+    out_not_writable = dpt.empty((), dtype="f4", sycl_queue=q1)
+    out_not_writable.flags["W"] = False
+
+    with pytest.raises(TypeError):
+        dpt.sum(x, out=dict())
+    with pytest.raises(TypeError):
+        dpt.max(x, out=dict())
+    with pytest.raises(TypeError):
+        dpt.argmax(x, out=dict())
+    with pytest.raises(ExecutionPlacementError):
+        dpt.sum(x, out=out_wrong_queue)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.max(x, out=out_wrong_queue)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.argmax(x, out=dpt.empty_like(out_wrong_queue, dtype=ind_dt))
+    with pytest.raises(ValueError):
+        dpt.sum(x, out=out_wrong_dtype)
+    with pytest.raises(ValueError):
+        dpt.max(x, out=out_wrong_dtype)
+    with pytest.raises(ValueError):
+        dpt.argmax(x, out=dpt.empty_like(out_wrong_dtype, dtype="f4"))
+    with pytest.raises(ValueError):
+        dpt.sum(x, out=out_wrong_shape)
+    with pytest.raises(ValueError):
+        dpt.max(x, out=out_wrong_shape)
+    with pytest.raises(ValueError):
+        dpt.argmax(x, out=dpt.empty_like(out_wrong_shape, dtype=ind_dt))
+    with pytest.raises(ValueError):
+        dpt.sum(x, out=out_not_writable)
+    with pytest.raises(ValueError):
+        dpt.max(x, out=out_not_writable)
+    with pytest.raises(ValueError):
+        search_not_writable = dpt.empty_like(out_not_writable, dtype=ind_dt)
+        search_not_writable.flags["W"] = False
+        dpt.argmax(x, out=search_not_writable)
+    with pytest.raises(ValueError):
+        dpt.sum(x, keepdims=True, out=out_wrong_keepdims)
+    with pytest.raises(ValueError):
+        dpt.max(x, keepdims=True, out=out_wrong_keepdims)
+    with pytest.raises(ValueError):
+        dpt.argmax(
+            x,
+            keepdims=True,
+            out=dpt.empty_like(out_wrong_keepdims, dtype=ind_dt),
+        )
+
+
+@pytest.mark.parametrize("dt", _all_dtypes)
+def test_count_nonzero(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    expected_dt = default_device_index_type(q.sycl_device)
+
+    x = dpt.ones(10, dtype=dt, sycl_queue=q)
+    res = dpt.count_nonzero(x)
+    assert res == 10
+    assert res.dtype == expected_dt
+
+    x[3:6] = 0
+    res = dpt.count_nonzero(x)
+    assert res == 7
+    assert res.dtype == expected_dt
diff --git a/dpnp/tests/tensor/test_usm_ndarray_search_functions.py b/dpnp/tests/tensor/test_usm_ndarray_search_functions.py
new file mode 100644
index 000000000000..30be5f0ee4f5
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_search_functions.py
@@ -0,0 +1,594 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+import itertools
+
+import numpy as np
+import pytest
+from dpctl.utils import ExecutionPlacementError
+from numpy.testing import assert_array_equal
+
+import dpnp.tensor as dpt
+from dpnp.tensor._search_functions import _where_result_type
+from dpnp.tensor._type_utils import _all_data_types
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_all_dtypes = [
+    "?",
+    "u1",
+    "i1",
+    "u2",
+    "i2",
+    "u4",
+    "i4",
+    "u8",
+    "i8",
+    "e",
+    "f",
+    "d",
+    "F",
+    "D",
+]
+
+
+class mock_device:
+    def __init__(self, fp16, fp64):
+        self.has_aspect_fp16 = fp16
+        self.has_aspect_fp64 = fp64
+
+
+def test_where_basic():
+    get_queue_or_skip()
+
+    cond = dpt.asarray(
+        [
+            [True, False, False],
+            [False, True, False],
+            [False, False, True],
+            [False, False, False],
+            [True, True, True],
+        ]
+    )
+    out = dpt.where(cond, dpt.asarray(1), dpt.asarray(0))
+    out_expected = dpt.asarray(
+        [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0], [1, 1, 1]]
+    )
+    assert (dpt.asnumpy(out) == dpt.asnumpy(out_expected)).all()
+
+    out = dpt.where(cond, dpt.ones(cond.shape), dpt.zeros(cond.shape))
+    assert (dpt.asnumpy(out) == dpt.asnumpy(out_expected)).all()
+
+    out = dpt.where(
+        cond,
+        dpt.ones(cond.shape[0], dtype="i4")[:, dpt.newaxis],
+        dpt.zeros(cond.shape[0], dtype="i4")[:, dpt.newaxis],
+    )
+    assert (dpt.asnumpy(out) == dpt.asnumpy(out_expected)).all()
+
+
+def _dtype_all_close(x1, x2):
+    if np.issubdtype(x2.dtype, np.floating) or np.issubdtype(
+        x2.dtype, np.complexfloating
+    ):
+        x2_dtype = x2.dtype
+        return np.allclose(
+            x1, x2, atol=np.finfo(x2_dtype).eps, rtol=np.finfo(x2_dtype).eps
+        )
+    else:
+        return np.allclose(x1, x2)
+
+
+@pytest.mark.parametrize("dt1", _all_dtypes)
+@pytest.mark.parametrize("dt2", _all_dtypes)
+@pytest.mark.parametrize("fp16", [True, False])
+@pytest.mark.parametrize("fp64", [True, False])
+def test_where_result_types(dt1, dt2, fp16, fp64):
+    dev = mock_device(fp16, fp64)
+
+    dt1 = dpt.dtype(dt1)
+    dt2 = dpt.dtype(dt2)
+    res_t = _where_result_type(dt1, dt2, dev)
+
+    if fp16 and fp64:
+        assert res_t == dpt.result_type(dt1, dt2)
+    else:
+        if res_t:
+            assert res_t.kind == dpt.result_type(dt1, dt2).kind
+        else:
+            # some illegal cases are covered above, but
+            # this guarantees that _where_result_type
+            # produces None only when one of the dtypes
+            # is illegal given fp aspects of device
+            all_dts = _all_data_types(fp16, fp64)
+            assert dt1 not in all_dts or dt2 not in all_dts
+
+
+@pytest.mark.parametrize("dt", _all_dtypes)
+def test_where_mask_dtypes(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    # mask dtype changes
+    cond = dpt.asarray([0, 1, 3, 0, 10], dtype=dt, sycl_queue=q)
+    x1 = dpt.asarray(0, dtype="f4", sycl_queue=q)
+    x2 = dpt.asarray(1, dtype="f4", sycl_queue=q)
+    res = dpt.where(cond, x1, x2)
+
+    res_check = np.asarray([1, 0, 0, 1, 0], dtype=res.dtype)
+    assert _dtype_all_close(dpt.asnumpy(res), res_check)
+
+    # contiguous cases
+    x1 = dpt.full(cond.shape, 0, dtype="f4", sycl_queue=q)
+    x2 = dpt.full(cond.shape, 1, dtype="f4", sycl_queue=q)
+    res = dpt.where(cond, x1, x2)
+    assert _dtype_all_close(dpt.asnumpy(res), res_check)
+
+    # input array dtype changes
+    cond = dpt.asarray([False, True, True, False, True], sycl_queue=q)
+    x1 = dpt.asarray(0, dtype=dt, sycl_queue=q)
+    x2 = dpt.asarray(1, dtype=dt, sycl_queue=q)
+    res = dpt.where(cond, x1, x2)
+
+    res_check = np.asarray([1, 0, 0, 1, 0], dtype=res.dtype)
+    assert _dtype_all_close(dpt.asnumpy(res), res_check)
+
+    # contiguous cases
+    x1 = dpt.full(cond.shape, 0, dtype=dt, sycl_queue=q)
+    x2 = dpt.full(cond.shape, 1, dtype=dt, sycl_queue=q)
+    res = dpt.where(cond, x1, x2)
+    assert _dtype_all_close(dpt.asnumpy(res), res_check)
+
+
+def test_where_asymmetric_dtypes():
+    q = get_queue_or_skip()
+
+    cond = dpt.asarray([0, 1, 3, 0, 10], dtype="?", sycl_queue=q)
+    x1 = dpt.asarray(2, dtype="i4", sycl_queue=q)
+    x2 = dpt.asarray(3, dtype="i8", sycl_queue=q)
+
+    res = dpt.where(cond, x1, x2)
+    res_check = np.asarray([3, 2, 2, 3, 2], dtype=res.dtype)
+    assert _dtype_all_close(dpt.asnumpy(res), res_check)
+
+    # flip order
+
+    res = dpt.where(cond, x2, x1)
+    res_check = np.asarray([2, 3, 3, 2, 3], dtype=res.dtype)
+    assert _dtype_all_close(dpt.asnumpy(res), res_check)
+
+
+def test_where_nan_inf():
+    get_queue_or_skip()
+
+    cond = dpt.asarray([True, False, True, False], dtype="?")
+    x1 = dpt.asarray([np.nan, 2.0, np.inf, 3.0], dtype="f4")
+    x2 = dpt.asarray([2.0, np.nan, 3.0, np.inf], dtype="f4")
+
+    cond_np = dpt.asnumpy(cond)
+    x1_np = dpt.asnumpy(x1)
+    x2_np = dpt.asnumpy(x2)
+
+    res = dpt.where(cond, x1, x2)
+    res_np = np.where(cond_np, x1_np, x2_np)
+
+    assert np.allclose(dpt.asnumpy(res), res_np, equal_nan=True)
+
+    res = dpt.where(x1, cond, x2)
+    res_np = np.where(x1_np, cond_np, x2_np)
+    assert _dtype_all_close(dpt.asnumpy(res), res_np)
+
+
+def test_where_empty():
+    # check that numpy returns same results when
+    # handling empty arrays
+    get_queue_or_skip()
+
+    empty = dpt.empty(0, dtype="i2")
+    m = dpt.asarray(True)
+    x1 = dpt.asarray(1, dtype="i2")
+    x2 = dpt.asarray(2, dtype="i2")
+    res = dpt.where(empty, x1, x2)
+
+    empty_np = np.empty(0, dtype="i2")
+    m_np = dpt.asnumpy(m)
+    x1_np = dpt.asnumpy(x1)
+    x2_np = dpt.asnumpy(x2)
+    res_np = np.where(empty_np, x1_np, x2_np)
+
+    assert_array_equal(dpt.asnumpy(res), res_np)
+
+    res = dpt.where(m, empty, x2)
+    res_np = np.where(m_np, empty_np, x2_np)
+
+    assert_array_equal(dpt.asnumpy(res), res_np)
+
+    # check that broadcasting is performed
+    with pytest.raises(ValueError):
+        dpt.where(empty, x1, dpt.empty((1, 2)))
+
+
+@pytest.mark.parametrize("order", ["C", "F"])
+def test_where_contiguous(order):
+    get_queue_or_skip()
+
+    cond = dpt.asarray(
+        [
+            [[True, False, False], [False, True, True]],
+            [[False, True, False], [True, False, True]],
+            [[False, False, True], [False, False, True]],
+            [[False, False, False], [True, False, True]],
+            [[True, True, True], [True, False, True]],
+        ],
+        order=order,
+    )
+
+    x1 = dpt.full(cond.shape, 2, dtype="i4", order=order)
+    x2 = dpt.full(cond.shape, 3, dtype="i4", order=order)
+    expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2))
+    res = dpt.where(cond, x1, x2)
+
+    assert _dtype_all_close(dpt.asnumpy(res), expected)
+
+
+def test_where_contiguous1D():
+    get_queue_or_skip()
+
+    cond = dpt.asarray([True, False, True, False, False, True])
+
+    x1 = dpt.full(cond.shape, 2, dtype="i4")
+    x2 = dpt.full(cond.shape, 3, dtype="i4")
+    expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2))
+    res = dpt.where(cond, x1, x2)
+    assert_array_equal(dpt.asnumpy(res), expected)
+
+    # test with complex dtype (branch in kernel)
+    x1 = dpt.astype(x1, dpt.complex64)
+    x2 = dpt.astype(x2, dpt.complex64)
+    expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2))
+    res = dpt.where(cond, x1, x2)
+    assert _dtype_all_close(dpt.asnumpy(res), expected)
+
+
+def test_where_gh_1170():
+    get_queue_or_skip()
+
+    cond = dpt.asarray([False, True, True, False], dtype="?")
+    x1 = dpt.ones((3, 4), dtype="i4")
+    x2 = dpt.zeros((3, 4), dtype="i4")
+
+    res = dpt.where(cond, x1, x2)
+    expected = np.broadcast_to(dpt.asnumpy(cond).astype(x1.dtype), x1.shape)
+
+    assert_array_equal(dpt.asnumpy(res), expected)
+
+
+def test_where_strided():
+    get_queue_or_skip()
+
+    s0, s1 = 4, 9
+    cond = dpt.reshape(
+        dpt.asarray(
+            [True, False, False, False, True, True, False, True, False] * s0
+        ),
+        (s0, s1),
+    )[:, ::3]
+
+    x1 = dpt.reshape(
+        dpt.arange(cond.shape[0] * cond.shape[1] * 2, dtype="i4"),
+        (cond.shape[0], cond.shape[1] * 2),
+    )[:, ::2]
+    x2 = dpt.reshape(
+        dpt.arange(cond.shape[0] * cond.shape[1] * 3, dtype="i4"),
+        (cond.shape[0], cond.shape[1] * 3),
+    )[:, ::3]
+    expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2))
+    res = dpt.where(cond, x1, x2)
+
+    assert_array_equal(dpt.asnumpy(res), expected)
+
+    # negative strides
+    res = dpt.where(cond, dpt.flip(x1), x2)
+    expected = np.where(
+        dpt.asnumpy(cond), np.flip(dpt.asnumpy(x1)), dpt.asnumpy(x2)
+    )
+    assert_array_equal(dpt.asnumpy(res), expected)
+
+    res = dpt.where(dpt.flip(cond), x1, x2)
+    expected = np.where(
+        np.flip(dpt.asnumpy(cond)), dpt.asnumpy(x1), dpt.asnumpy(x2)
+    )
+    assert_array_equal(dpt.asnumpy(res), expected)
+
+
+def test_where_invariants():
+    get_queue_or_skip()
+
+    test_sh = (
+        6,
+        8,
+    )
+    mask = dpt.asarray(np.random.choice([True, False], size=test_sh))
+    p = dpt.ones(test_sh, dtype=dpt.int16)
+    m = dpt.full(test_sh, -1, dtype=dpt.int16)
+    inds_list = [
+        (
+            np.s_[:3],
+            np.s_[::2],
+        ),
+        (
+            np.s_[::2],
+            np.s_[::2],
+        ),
+        (
+            np.s_[::-1],
+            np.s_[:],
+        ),
+    ]
+    for ind in inds_list:
+        r1 = dpt.where(mask, p, m)[ind]
+        r2 = dpt.where(mask[ind], p[ind], m[ind])
+        assert (dpt.asnumpy(r1) == dpt.asnumpy(r2)).all()
+
+
+def test_where_arg_validation():
+    get_queue_or_skip()
+
+    check = {}
+    x1 = dpt.empty((1,), dtype="i4")
+    x2 = dpt.empty((1,), dtype="i4")
+
+    with pytest.raises(TypeError):
+        dpt.where(check, x1, x2)
+    with pytest.raises(ValueError):
+        dpt.where(x1, check, x2)
+    with pytest.raises(ValueError):
+        dpt.where(x1, x2, check)
+
+
+def test_where_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+    q3 = get_queue_or_skip()
+
+    x1 = dpt.empty((1,), dtype="i4", sycl_queue=q1)
+    x2 = dpt.empty((1,), dtype="i4", sycl_queue=q2)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.where(dpt.empty((1,), dtype="i4", sycl_queue=q1), x1, x2)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.where(dpt.empty((1,), dtype="i4", sycl_queue=q3), x1, x2)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.where(x1, x1, x2)
+
+
+def test_where_order():
+    get_queue_or_skip()
+
+    test_sh = (
+        20,
+        20,
+    )
+    test_sh2 = tuple(2 * dim for dim in test_sh)
+    n = test_sh[-1]
+
+    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
+        ar1 = dpt.zeros(test_sh, dtype=dt1, order="C")
+        ar2 = dpt.ones(test_sh, dtype=dt2, order="C")
+        condition = dpt.zeros(test_sh, dtype="?", order="C")
+        res1 = dpt.where(condition, ar1, ar2, order="C")
+        assert res1.flags.c_contiguous
+        res2 = dpt.where(condition, ar1, ar2, order="F")
+        assert res2.flags.f_contiguous
+        res3 = dpt.where(condition, ar1, ar2, order="A")
+        assert res3.flags.c_contiguous
+        res4 = dpt.where(condition, ar1, ar2, order="K")
+        assert res4.flags.c_contiguous
+
+        ar1 = dpt.ones(test_sh, dtype=dt1, order="F")
+        ar2 = dpt.ones(test_sh, dtype=dt2, order="F")
+        condition = dpt.zeros(test_sh, dtype="?", order="F")
+        res1 = dpt.where(condition, ar1, ar2, order="C")
+        assert res1.flags.c_contiguous
+        res2 = dpt.where(condition, ar1, ar2, order="F")
+        assert res2.flags.f_contiguous
+        res3 = dpt.where(condition, ar1, ar2, order="A")
+        assert res2.flags.f_contiguous
+        res4 = dpt.where(condition, ar1, ar2, order="K")
+        assert res4.flags.f_contiguous
+
+        ar1 = dpt.ones(test_sh2, dtype=dt1, order="C")[:20, ::-2]
+        ar2 = dpt.ones(test_sh2, dtype=dt2, order="C")[:20, ::-2]
+        condition = dpt.zeros(test_sh2, dtype="?", order="C")[:20, ::-2]
+        res1 = dpt.where(condition, ar1, ar2, order="K")
+        assert res1.strides == (n, -1)
+        res2 = dpt.where(condition, ar1, ar2, order="C")
+        assert res2.strides == (n, 1)
+
+        ar1 = dpt.ones(test_sh2, dtype=dt1, order="C")[:20, ::-2].mT
+        ar2 = dpt.ones(test_sh2, dtype=dt2, order="C")[:20, ::-2].mT
+        condition = dpt.zeros(test_sh2, dtype="?", order="C")[:20, ::-2].mT
+        res1 = dpt.where(condition, ar1, ar2, order="K")
+        assert res1.strides == (-1, n)
+        res2 = dpt.where(condition, ar1, ar2, order="C")
+        assert res2.strides == (n, 1)
+
+        ar1 = dpt.ones(n, dtype=dt1, order="C")
+        ar2 = dpt.broadcast_to(dpt.ones(n, dtype=dt2, order="C"), test_sh)
+        condition = dpt.zeros(n, dtype="?", order="C")
+        res = dpt.where(condition, ar1, ar2, order="K")
+        assert res.strides == (20, 1)
+
+
+def test_where_unaligned():
+    get_queue_or_skip()
+
+    x = dpt.ones(513, dtype="i4")
+    a = dpt.full(512, 2, dtype="i4")
+    b = dpt.zeros(512, dtype="i4")
+
+    expected = dpt.full(512, 2, dtype="i4")
+    assert dpt.all(dpt.where(x[1:], a, b) == expected)
+
+
+def test_where_out():
+    get_queue_or_skip()
+
+    n1, n2, n3 = 3, 4, 5
+    ar1 = dpt.reshape(dpt.arange(n1 * n2 * n3, dtype="i4"), (n1, n2, n3))
+    ar2 = dpt.full_like(ar1, -5)
+    condition = dpt.tile(
+        dpt.reshape(
+            dpt.asarray([True, False, False, True], dtype="?"), (1, n2, 1)
+        ),
+        (n1, 1, n3),
+    )
+
+    out = dpt.zeros((2 * n1, 3 * n2, n3), dtype="i4")
+    res = dpt.where(condition, ar1, ar2, out=out[::-2, 1::3, :])
+
+    assert dpt.all(res == out[::-2, 1::3, :])
+    assert dpt.all(out[::-2, 0::3, :] == 0)
+    assert dpt.all(out[::-2, 2::3, :] == 0)
+
+    assert dpt.all(res[:, 1:3, :] == -5)
+    assert dpt.all(res[:, 0, :] == ar1[:, 0, :])
+    assert dpt.all(res[:, 3, :] == ar1[:, 3, :])
+
+    condition = dpt.tile(
+        dpt.reshape(dpt.asarray([1, 0], dtype="i4"), (1, 2, 1)),
+        (n1, 2, n3),
+    )
+    res = dpt.where(
+        condition[:, ::-1, :], condition[:, ::-1, :], condition, out=condition
+    )
+    assert dpt.all(res == condition)
+    assert dpt.all(condition == 1)
+
+    condition = dpt.tile(
+        dpt.reshape(dpt.asarray([True, False], dtype="?"), (1, 2, 1)),
+        (n1, 2, n3),
+    )
+    ar1 = dpt.full((n1, n2, n3), 7, dtype="i4")
+    ar2 = dpt.full_like(ar1, -5)
+    res = dpt.where(condition, ar1, ar2, out=ar2[:, ::-1, :])
+    assert dpt.all(ar2[:, ::-1, :] == res)
+    assert dpt.all(ar2[:, ::2, :] == -5)
+    assert dpt.all(ar2[:, 1::2, :] == 7)
+
+    condition = dpt.tile(
+        dpt.reshape(dpt.asarray([True, False], dtype="?"), (1, 2, 1)),
+        (n1, 2, n3),
+    )
+    ar1 = dpt.full((n1, n2, n3), 7, dtype="i4")
+    ar2 = dpt.full_like(ar1, -5)
+    res = dpt.where(condition, ar1, ar2, out=ar1[:, ::-1, :])
+    assert dpt.all(ar1[:, ::-1, :] == res)
+    assert dpt.all(ar1[:, ::2, :] == -5)
+    assert dpt.all(ar1[:, 1::2, :] == 7)
+
+
+def test_where_out_arg_validation():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    condition = dpt.ones(5, dtype="i4", sycl_queue=q1)
+    x1 = dpt.ones(5, dtype="i4", sycl_queue=q1)
+    x2 = dpt.ones(5, dtype="i4", sycl_queue=q1)
+
+    out_wrong_queue = dpt.empty_like(condition, sycl_queue=q2)
+    out_wrong_dtype = dpt.empty_like(condition, dtype="f4")
+    out_wrong_shape = dpt.empty(6, dtype="i4", sycl_queue=q1)
+    out_not_writable = dpt.empty_like(condition)
+    out_not_writable.flags["W"] = False
+
+    with pytest.raises(TypeError):
+        dpt.where(condition, x1, x2, out=dict())
+    with pytest.raises(ExecutionPlacementError):
+        dpt.where(condition, x1, x2, out=out_wrong_queue)
+    with pytest.raises(ValueError):
+        dpt.where(condition, x1, x2, out=out_wrong_dtype)
+    with pytest.raises(ValueError):
+        dpt.where(condition, x1, x2, out=out_wrong_shape)
+    with pytest.raises(ValueError):
+        dpt.where(condition, x1, x2, out=out_not_writable)
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_where_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    n1, n2 = 10, 10
+    condition = dpt.tile(
+        dpt.reshape(
+            dpt.asarray([True, False], dtype="?", sycl_queue=q), (1, 2)
+        ),
+        (n1, n2 // 2),
+    )
+    x = dpt.zeros((n1, n2), dtype=arr_dt, sycl_queue=q)
+    py_scalars = (
+        bool(0),
+        int(0),
+        float(0),
+        complex(0),
+        np.float32(0),
+        ctypes.c_int(0),
+    )
+    for sc in py_scalars:
+        r = dpt.where(condition, x, sc)
+        assert isinstance(r, dpt.usm_ndarray)
+        r = dpt.where(condition, sc, x)
+        assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_where_two_python_scalars():
+    get_queue_or_skip()
+
+    n1, n2 = 10, 10
+    condition = dpt.tile(
+        dpt.reshape(dpt.asarray([True, False], dtype="?"), (1, 2)),
+        (n1, n2 // 2),
+    )
+
+    py_scalars = [
+        bool(0),
+        int(0),
+        float(0),
+        complex(0),
+        np.float32(0),
+        ctypes.c_int(0),
+    ]
+
+    for sc1, sc2 in itertools.product(py_scalars, repeat=2):
+        r = dpt.where(condition, sc1, sc2)
+        assert isinstance(r, dpt.usm_ndarray)
diff --git a/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py b/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py
new file mode 100644
index 000000000000..d97e224b61cc
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py
@@ -0,0 +1,408 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import dpctl.utils as dpu
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+def _check(hay_stack, needles, needles_np):
+    assert hay_stack.dtype == needles.dtype
+    assert hay_stack.ndim == 1
+
+    info_ = dpt.__array_namespace_info__()
+    default_dts_dev = info_.default_dtypes(device=hay_stack.device)
+    index_dt = default_dts_dev["indexing"]
+
+    p_left = dpt.searchsorted(hay_stack, needles, side="left")
+    assert p_left.dtype == index_dt
+
+    hs_np = dpt.asnumpy(hay_stack)
+    ref_left = np.searchsorted(hs_np, needles_np, side="left")
+    assert dpt.all(p_left == dpt.asarray(ref_left))
+
+    p_right = dpt.searchsorted(hay_stack, needles, side="right")
+    assert p_right.dtype == index_dt
+
+    ref_right = np.searchsorted(hs_np, needles_np, side="right")
+    assert dpt.all(p_right == dpt.asarray(ref_right))
+
+    sorter = dpt.arange(hay_stack.size)
+    ps_left = dpt.searchsorted(hay_stack, needles, side="left", sorter=sorter)
+    assert ps_left.dtype == index_dt
+    assert dpt.all(ps_left == p_left)
+    ps_right = dpt.searchsorted(hay_stack, needles, side="right", sorter=sorter)
+    assert ps_right.dtype == index_dt
+    assert dpt.all(ps_right == p_right)
+
+
+def test_searchsorted_contig_bool():
+    get_queue_or_skip()
+
+    dt = dpt.bool
+
+    hay_stack = dpt.arange(0, 1, dtype=dt)
+    needles_np = np.random.choice([True, False], size=1024)
+    needles = dpt.asarray(needles_np)
+
+    _check(hay_stack, needles, needles_np)
+    _check(
+        hay_stack,
+        dpt.reshape(needles, (32, 32)),
+        np.reshape(needles_np, (32, 32)),
+    )
+
+
+def test_searchsorted_strided_bool():
+    get_queue_or_skip()
+
+    dt = dpt.bool
+
+    hay_stack = dpt.repeat(dpt.arange(0, 1, dtype=dt), 4)[::4]
+    needles_np = np.random.choice([True, False], size=2 * 1024)
+    needles = dpt.asarray(needles_np)
+    sl = slice(None, None, -2)
+
+    _check(hay_stack, needles[sl], needles_np[sl])
+    _check(
+        hay_stack,
+        dpt.reshape(needles[sl], (32, 32)),
+        np.reshape(needles_np[sl], (32, 32)),
+    )
+
+
+@pytest.mark.parametrize(
+    "idt",
+    [
+        dpt.int8,
+        dpt.uint8,
+        dpt.int16,
+        dpt.uint16,
+        dpt.int32,
+        dpt.uint32,
+        dpt.int64,
+        dpt.uint64,
+    ],
+)
+def test_searchsorted_contig_int(idt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(idt, q)
+
+    dt = dpt.dtype(idt)
+    max_v = dpt.iinfo(dt).max
+
+    hay_stack = dpt.arange(0, min(max_v, 255), dtype=dt)
+    needles_np = np.random.randint(0, max_v, dtype=dt, size=1024)
+    needles = dpt.asarray(needles_np)
+
+    _check(hay_stack, needles, needles_np)
+    _check(
+        hay_stack,
+        dpt.reshape(needles, (32, 32)),
+        np.reshape(needles_np, (32, 32)),
+    )
+
+
+@pytest.mark.parametrize(
+    "idt",
+    [
+        dpt.int8,
+        dpt.uint8,
+        dpt.int16,
+        dpt.uint16,
+        dpt.int32,
+        dpt.uint32,
+        dpt.int64,
+        dpt.uint64,
+    ],
+)
+def test_searchsorted_strided_int(idt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(idt, q)
+
+    dt = dpt.dtype(idt)
+    max_v = dpt.iinfo(dt).max
+
+    hay_stack = dpt.repeat(dpt.arange(0, min(max_v, 255), dtype=dt), 4)[1::4]
+    needles_np = np.random.randint(0, max_v, dtype=dt, size=2 * 1024)
+    needles = dpt.asarray(needles_np)
+    sl = slice(None, None, -2)
+
+    _check(hay_stack, needles[sl], needles_np[sl])
+    _check(
+        hay_stack,
+        dpt.reshape(needles[sl], (32, 32)),
+        np.reshape(needles_np[sl], (32, 32)),
+    )
+
+
+def _add_extended_fp(array):
+    array[0] = -dpt.inf
+    array[-2] = dpt.inf
+    array[-1] = dpt.nan
+
+
+@pytest.mark.parametrize("idt", [dpt.float16, dpt.float32, dpt.float64])
+def test_searchsorted_contig_fp(idt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(idt, q)
+
+    dt = dpt.dtype(idt)
+
+    hay_stack = dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True)
+    _add_extended_fp(hay_stack)
+
+    needles_np = np.random.uniform(-0.1, 1.1, size=1024).astype(dt)
+    needles = dpt.asarray(needles_np)
+
+    _check(hay_stack, needles, needles_np)
+    _check(
+        hay_stack,
+        dpt.reshape(needles, (32, 32)),
+        np.reshape(needles_np, (32, 32)),
+    )
+
+
+@pytest.mark.parametrize("idt", [dpt.float16, dpt.float32, dpt.float64])
+def test_searchsorted_strided_fp(idt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(idt, q)
+
+    dt = dpt.dtype(idt)
+
+    hay_stack = dpt.repeat(
+        dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True), 4
+    )[1::4]
+    _add_extended_fp(hay_stack)
+
+    needles_np = np.random.uniform(-0.1, 1.1, size=3 * 1024).astype(dt)
+    needles = dpt.asarray(needles_np)
+    sl = slice(1, None, 3)
+
+    _check(hay_stack, needles[sl], needles_np[sl])
+    _check(
+        hay_stack,
+        dpt.reshape(needles[sl], (32, 32)),
+        np.reshape(needles_np[sl], (32, 32)),
+    )
+
+
+def _add_extended_cfp(array):
+    dt = array.dtype
+    ev_li = [
+        complex(-dpt.inf, -1),
+        complex(-dpt.inf, -dpt.inf),
+        complex(-dpt.inf, dpt.inf),
+        complex(-dpt.inf, dpt.nan),
+        complex(0, -dpt.inf),
+        complex(0, -1),
+        complex(0, dpt.inf),
+        complex(0, dpt.nan),
+        complex(dpt.inf, -dpt.inf),
+        complex(dpt.inf, -1),
+        complex(dpt.inf, dpt.inf),
+        complex(dpt.inf, dpt.nan),
+        complex(dpt.nan, -dpt.inf),
+        complex(dpt.nan, -1),
+        complex(dpt.nan, dpt.inf),
+        complex(dpt.nan, dpt.nan),
+    ]
+    ev = dpt.asarray(ev_li, dtype=dt, device=array.device)
+    return dpt.sort(dpt.concat((ev, array)))
+
+
+@pytest.mark.parametrize("idt", [dpt.complex64, dpt.complex128])
+def test_searchsorted_contig_cfp(idt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(idt, q)
+
+    dt = dpt.dtype(idt)
+
+    hay_stack = dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True)
+    hay_stack = _add_extended_cfp(hay_stack)
+    needles_np = np.random.uniform(-0.1, 1.1, size=1024).astype(dt)
+    needles = dpt.asarray(needles_np)
+
+    _check(hay_stack, needles, needles_np)
+    _check(
+        hay_stack,
+        dpt.reshape(needles, (32, 32)),
+        np.reshape(needles_np, (32, 32)),
+    )
+
+
+@pytest.mark.parametrize("idt", [dpt.complex64, dpt.complex128])
+def test_searchsorted_strided_cfp(idt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(idt, q)
+
+    dt = dpt.dtype(idt)
+
+    hay_stack = dpt.repeat(
+        dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True), 4
+    )[1::4]
+    needles_np = np.random.uniform(-0.1, 1.1, size=3 * 1024).astype(dt)
+    needles = dpt.asarray(needles_np)
+    sl = slice(1, None, 3)
+
+    _check(hay_stack, needles[sl], needles_np[sl])
+    _check(
+        hay_stack,
+        dpt.reshape(needles[sl], (32, 32)),
+        np.reshape(needles_np[sl], (32, 32)),
+    )
+
+    hay_stack = _add_extended_cfp(hay_stack)
+    _check(hay_stack, needles[sl], needles_np[sl])
+    _check(
+        hay_stack,
+        dpt.reshape(needles[sl], (32, 32)),
+        np.reshape(needles_np[sl], (32, 32)),
+    )
+
+
+def test_searchsorted_coerce():
+    get_queue_or_skip()
+
+    x1_i4 = dpt.arange(5, dtype="i4")
+    x1_i8 = dpt.arange(5, dtype="i8")
+    x2_i4 = dpt.arange(5, dtype="i4")
+    x2_i8 = dpt.arange(5, dtype="i8")
+
+    p1 = dpt.searchsorted(x1_i4, x2_i8)
+    p2 = dpt.searchsorted(x1_i8, x2_i8)
+    p3 = dpt.searchsorted(x1_i8, x2_i4)
+    assert dpt.all(p1 == p2)
+    assert dpt.all(p2 == p3)
+
+
+def test_searchsorted_validation():
+    with pytest.raises(TypeError):
+        dpt.searchsorted(None, None)
+    try:
+        x1 = dpt.arange(10, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("Default device could not be created")
+    with pytest.raises(TypeError):
+        dpt.searchsorted(x1, None)
+    with pytest.raises(TypeError):
+        dpt.searchsorted(x1, x1, sorter=dict())
+    with pytest.raises(ValueError):
+        dpt.searchsorted(x1, x1, side="unknown")
+
+
+def test_searchsorted_validation2():
+    try:
+        x1 = dpt.arange(10, dtype="i4")
+        sorter = dpt.arange(10, dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("Default device could not be created")
+    d = x1.sycl_device
+    q2 = dpctl.SyclQueue(d, property="in_order")
+    x2 = dpt.ones(5, dtype=x1.dtype, sycl_queue=q2)
+
+    with pytest.raises(dpu.ExecutionPlacementError):
+        dpt.searchsorted(x1, x2)
+
+    with pytest.raises(dpu.ExecutionPlacementError):
+        dpt.searchsorted(x1, x2, sorter=sorter)
+
+    sorter = dpt.ones(x1.shape, dtype=dpt.bool)
+    # non-integral sorter.dtype raises
+    with pytest.raises(ValueError):
+        dpt.searchsorted(x1, x1, sorter=sorter)
+
+    # non-matching x1.shape and sorter.shape raises
+    with pytest.raises(ValueError):
+        dpt.searchsorted(x1, x1, sorter=sorter[:-1])
+
+    # x1 must be 1d, or ValueError is raised
+    with pytest.raises(ValueError):
+        dpt.searchsorted(x1[dpt.newaxis, :], x1)
+
+
+def test_pw_linear_interpolation_example():
+    get_queue_or_skip()
+
+    bins = dpt.asarray([0.0, 0.05, 0.2, 0.25, 0.5, 0.8, 0.95, 1])
+    vals = dpt.asarray([0.1, 0.15, 0.3, 0.5, 0.7, 0.53, 0.37, 0.1])
+    assert vals.shape == bins.shape
+    data_np = np.random.uniform(0, 1, size=10000)
+    data = dpt.asarray(data_np)
+
+    p = dpt.searchsorted(bins, data)
+    w = (data - bins[p]) / (bins[p - 1] - bins[p])
+    assert dpt.min(w) >= 0
+    assert dpt.max(w) <= 1
+    interp_vals = vals[p - 1] * w + (1 - w) * vals[p]
+
+    assert interp_vals.shape == data.shape
+    assert dpt.min(interp_vals) >= dpt.zeros(tuple())
+    av = dpt.sum(interp_vals) / data.size
+    exp = dpt.vecdot(vals[1:] + vals[:-1], bins[1:] - bins[:-1]) / 2
+
+    assert dpt.abs(av - exp) < 0.1
+
+
+def test_out_of_bound_sorter_values():
+    get_queue_or_skip()
+
+    x = dpt.asarray([1, 2, 0], dtype="i4")
+    n = x.shape[0]
+
+    # use out-of-bounds indices in sorter
+    sorter = dpt.asarray([2, 0 - n, 1 - n], dtype="i8")
+
+    x2 = dpt.arange(3, dtype=x.dtype)
+    p = dpt.searchsorted(x, x2, sorter=sorter)
+    # verify that they were applied with mode="wrap"
+    assert dpt.all(p == dpt.arange(3, dtype=p.dtype))
+
+
+def test_searchsorted_strided_scalar_needle():
+    get_queue_or_skip()
+
+    a_max = 255
+
+    hay_stack = dpt.flip(
+        dpt.repeat(dpt.arange(a_max - 1, -1, -1, dtype=dpt.int32), 4)
+    )
+    needles_np = np.squeeze(
+        np.random.randint(0, a_max, dtype=dpt.int32, size=1), axis=0
+    )
+    needles = dpt.asarray(needles_np)
+
+    _check(hay_stack, needles, needles_np)
diff --git a/dpnp/tests/tensor/test_usm_ndarray_sorting.py b/dpnp/tests/tensor/test_usm_ndarray_sorting.py
new file mode 100644
index 000000000000..af96811bf2f9
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_sorting.py
@@ -0,0 +1,397 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_array_equal
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_sort_1d(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    inp = dpt.roll(
+        dpt.concat(
+            (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype))
+        ),
+        734,
+    )
+
+    s = dpt.sort(inp, descending=False)
+    assert dpt.all(s[:-1] <= s[1:])
+
+    s1 = dpt.sort(inp, descending=True)
+    assert dpt.all(s1[:-1] >= s1[1:])
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_sort_2d(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    fl = dpt.roll(
+        dpt.concat(
+            (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype))
+        ),
+        734,
+    )
+    inp = dpt.reshape(fl, (20, -1))
+
+    s = dpt.sort(inp, axis=1, descending=False)
+    assert dpt.all(s[:, :-1] <= s[:, 1:])
+
+    s1 = dpt.sort(inp, axis=1, descending=True)
+    assert dpt.all(s1[:, :-1] >= s1[:, 1:])
+
+
+def test_sort_strides():
+    get_queue_or_skip()
+
+    fl = dpt.roll(
+        dpt.concat((dpt.ones(10000, dtype="i4"), dpt.zeros(10000, dtype="i4"))),
+        734,
+    )
+    inp = dpt.reshape(fl, (-1, 20))
+
+    s = dpt.sort(inp, axis=0, descending=False)
+    assert dpt.all(s[:-1, :] <= s[1:, :])
+
+    s1 = dpt.sort(inp, axis=0, descending=True)
+    assert dpt.all(s1[:-1, :] >= s1[1:, :])
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_argsort_1d(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    inp = dpt.roll(
+        dpt.concat(
+            (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype))
+        ),
+        734,
+    )
+
+    s_idx = dpt.argsort(inp, descending=False)
+    assert dpt.all(inp[s_idx[:-1]] <= inp[s_idx[1:]])
+
+    s1_idx = dpt.argsort(inp, descending=True)
+    assert dpt.all(inp[s1_idx[:-1]] >= inp[s1_idx[1:]])
+
+
+def test_sort_validation():
+    with pytest.raises(TypeError):
+        dpt.sort(dict())
+
+
+def test_sort_validation_kind():
+    get_queue_or_skip()
+
+    x = dpt.ones(128, dtype="u1")
+
+    with pytest.raises(ValueError):
+        dpt.sort(x, kind=Ellipsis)
+
+    with pytest.raises(ValueError):
+        dpt.sort(x, kind="invalid")
+
+
+def test_argsort_validation():
+    with pytest.raises(TypeError):
+        dpt.argsort(dict())
+
+
+def test_argsort_validation_kind():
+    get_queue_or_skip()
+
+    x = dpt.arange(127, stop=0, step=-1, dtype="i1")
+
+    with pytest.raises(ValueError):
+        dpt.argsort(x, kind=Ellipsis)
+
+    with pytest.raises(ValueError):
+        dpt.argsort(x, kind="invalid")
+
+
+_all_kinds = ["stable", "mergesort", "radixsort"]
+
+
+@pytest.mark.parametrize("kind", _all_kinds)
+def test_sort_axis0(kind):
+    get_queue_or_skip()
+
+    n, m = 200, 30
+    xf = dpt.arange(n * m, 0, step=-1, dtype="i4")
+    x = dpt.reshape(xf, (n, m))
+    s = dpt.sort(x, axis=0, kind=kind)
+
+    assert dpt.all(s[:-1, :] <= s[1:, :])
+
+
+@pytest.mark.parametrize("kind", _all_kinds)
+def test_argsort_axis0(kind):
+    get_queue_or_skip()
+
+    n, m = 200, 30
+    xf = dpt.arange(n * m, 0, step=-1, dtype="i4")
+    x = dpt.reshape(xf, (n, m))
+    idx = dpt.argsort(x, axis=0, kind=kind)
+
+    s = dpt.take_along_axis(x, idx, axis=0)
+
+    assert dpt.all(s[:-1, :] <= s[1:, :])
+
+
+@pytest.mark.parametrize("kind", _all_kinds)
+def test_argsort_axis1(kind):
+    get_queue_or_skip()
+
+    n, m = 200, 30
+    xf = dpt.arange(n * m, 0, step=-1, dtype="i4")
+    x = dpt.reshape(xf, (n, m))
+    idx = dpt.argsort(x, axis=1, kind=kind)
+
+    s = dpt.take_along_axis(x, idx, axis=1)
+
+    assert dpt.all(s[:, :-1] <= s[:, 1:])
+
+
+@pytest.mark.parametrize("kind", _all_kinds)
+def test_sort_strided(kind):
+    get_queue_or_skip()
+
+    x_orig = dpt.arange(100, dtype="i4")
+    x_flipped = dpt.flip(x_orig, axis=0)
+    s = dpt.sort(x_flipped, kind=kind)
+
+    assert dpt.all(s == x_orig)
+
+
+@pytest.mark.parametrize("kind", _all_kinds)
+def test_argsort_strided(kind):
+    get_queue_or_skip()
+
+    x_orig = dpt.arange(100, dtype="i4")
+    x_flipped = dpt.flip(x_orig, axis=0)
+    idx = dpt.argsort(x_flipped, kind=kind)
+    s = dpt.take_along_axis(x_flipped, idx, axis=0)
+
+    assert dpt.all(s == x_orig)
+
+
+@pytest.mark.parametrize("kind", _all_kinds)
+def test_sort_0d_array(kind):
+    get_queue_or_skip()
+
+    x = dpt.asarray(1, dtype="i4")
+    expected = dpt.asarray(1, dtype="i4")
+    assert dpt.sort(x, kind=kind) == expected
+
+
+@pytest.mark.parametrize("kind", _all_kinds)
+def test_argsort_0d_array(kind):
+    get_queue_or_skip()
+
+    x = dpt.asarray(1, dtype="i4")
+    expected = dpt.asarray(0, dtype="i4")
+    assert dpt.argsort(x, kind=kind) == expected
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "f2",
+        "f4",
+        "f8",
+    ],
+)
+@pytest.mark.parametrize("kind", _all_kinds)
+def test_sort_real_fp_nan(dtype, kind):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(
+        [-0.0, 0.1, dpt.nan, 0.0, -0.1, dpt.nan, 0.2, -0.3], dtype=dtype
+    )
+    s = dpt.sort(x, kind=kind)
+
+    expected = dpt.asarray(
+        [-0.3, -0.1, -0.0, 0.0, 0.1, 0.2, dpt.nan, dpt.nan], dtype=dtype
+    )
+
+    assert dpt.allclose(s, expected, equal_nan=True)
+
+    s = dpt.sort(x, descending=True, kind=kind)
+
+    expected = dpt.asarray(
+        [dpt.nan, dpt.nan, 0.2, 0.1, -0.0, 0.0, -0.1, -0.3], dtype=dtype
+    )
+
+    assert dpt.allclose(s, expected, equal_nan=True)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "c8",
+        "c16",
+    ],
+)
+def test_sort_complex_fp_nan(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    rvs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan]
+    ivs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan]
+
+    cv = []
+    for rv in rvs:
+        for iv in ivs:
+            cv.append(complex(rv, iv))
+
+    inp = dpt.asarray(cv, dtype=dtype)
+    s = dpt.sort(inp)
+
+    expected = np.sort(dpt.asnumpy(inp))
+
+    assert np.allclose(dpt.asnumpy(s), expected, equal_nan=True)
+
+    pairs = []
+    for i, j in itertools.permutations(range(inp.shape[0]), 2):
+        pairs.append([i, j])
+    sub_arrs = inp[dpt.asarray(pairs)]
+    m1 = dpt.asnumpy(dpt.sort(sub_arrs, axis=1))
+    m2 = np.sort(dpt.asnumpy(sub_arrs), axis=1)
+    for k in range(len(pairs)):
+        i, j = pairs[k]
+        r1 = m1[k]
+        r2 = m2[k]
+        assert np.array_equal(
+            r1.view(np.int64), r2.view(np.int64)
+        ), f"Failed for {i} and {j}"
+
+
+def test_radix_sort_size_1_axis():
+    get_queue_or_skip()
+
+    x1 = dpt.ones((), dtype="i1")
+    r1 = dpt.sort(x1, kind="radixsort")
+    assert_array_equal(dpt.asnumpy(r1), dpt.asnumpy(x1))
+
+    x2 = dpt.ones([1], dtype="i1")
+    r2 = dpt.sort(x2, kind="radixsort")
+    assert_array_equal(dpt.asnumpy(r2), dpt.asnumpy(x2))
+
+    x3 = dpt.reshape(dpt.arange(10, dtype="i1"), (10, 1))
+    r3 = dpt.sort(x3, kind="radixsort")
+    assert dpt.asnumpy(r3 == x3).all()
+
+    x4 = dpt.reshape(dpt.arange(10, dtype="i1"), (1, 10))
+    r4 = dpt.sort(x4, axis=0, kind="radixsort")
+    assert dpt.asnumpy(r4 == x4).all()
+
+
+def test_radix_argsort_size_1_axis():
+    get_queue_or_skip()
+
+    x1 = dpt.ones((), dtype="i1")
+    r1 = dpt.argsort(x1, kind="radixsort")
+    assert r1 == 0
+
+    x2 = dpt.ones([1], dtype="i1")
+    r2 = dpt.argsort(x2, kind="radixsort")
+    assert dpt.asnumpy(r2 == 0).all()
+
+    x3 = dpt.reshape(dpt.arange(10, dtype="i1"), (10, 1))
+    r3 = dpt.argsort(x3, kind="radixsort")
+    assert dpt.asnumpy(r3 == 0).all()
+
+    x4 = dpt.reshape(dpt.arange(10, dtype="i1"), (1, 10))
+    r4 = dpt.argsort(x4, axis=0, kind="radixsort")
+    assert dpt.asnumpy(r4 == 0).all()
diff --git a/dpnp/tests/tensor/test_usm_ndarray_top_k.py b/dpnp/tests/tensor/test_usm_ndarray_top_k.py
new file mode 100644
index 000000000000..1c04c1fff57a
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_top_k.py
@@ -0,0 +1,331 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+def _expected_largest_inds(inp, n, shift, k):
+    "Computed expected top_k indices for mode='largest'"
+    assert k < n
+    ones_start_id = shift % (2 * n)
+
+    alloc_dev = inp.device
+
+    if ones_start_id < n:
+        expected_inds = dpt.arange(
+            ones_start_id, ones_start_id + k, dtype="i8", device=alloc_dev
+        )
+    else:
+        # wrap-around
+        ones_end_id = (ones_start_id + n) % (2 * n)
+        if ones_end_id >= k:
+            expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev)
+        else:
+            expected_inds = dpt.concat(
+                (
+                    dpt.arange(ones_end_id, dtype="i8", device=alloc_dev),
+                    dpt.arange(
+                        ones_start_id,
+                        ones_start_id + k - ones_end_id,
+                        dtype="i8",
+                        device=alloc_dev,
+                    ),
+                )
+            )
+
+    return expected_inds
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+@pytest.mark.parametrize("n", [33, 43, 255, 511, 1021, 8193])
+def test_top_k_1d_largest(dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    shift, k = 734, 5
+    o = dpt.ones(n, dtype=dtype)
+    z = dpt.zeros(n, dtype=dtype)
+    oz = dpt.concat((o, z))
+    inp = dpt.roll(oz, shift)
+
+    expected_inds = _expected_largest_inds(oz, n, shift, k)
+
+    s = dpt.top_k(inp, k, mode="largest")
+    assert s.values.shape == (k,)
+    assert s.values.dtype == inp.dtype
+    assert s.indices.shape == (k,)
+    assert dpt.all(s.values == dpt.ones(k, dtype=dtype)), s.values
+    assert dpt.all(s.values == inp[s.indices]), s.indices
+    assert dpt.all(s.indices == expected_inds), (s.indices, expected_inds)
+
+
+def _expected_smallest_inds(inp, n, shift, k):
+    "Computed expected top_k indices for mode='smallest'"
+    assert k < n
+    zeros_start_id = (n + shift) % (2 * n)
+    zeros_end_id = (shift) % (2 * n)
+
+    alloc_dev = inp.device
+
+    if zeros_start_id < zeros_end_id:
+        expected_inds = dpt.arange(
+            zeros_start_id, zeros_start_id + k, dtype="i8", device=alloc_dev
+        )
+    else:
+        if zeros_end_id >= k:
+            expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev)
+        else:
+            expected_inds = dpt.concat(
+                (
+                    dpt.arange(zeros_end_id, dtype="i8", device=alloc_dev),
+                    dpt.arange(
+                        zeros_start_id,
+                        zeros_start_id + k - zeros_end_id,
+                        dtype="i8",
+                        device=alloc_dev,
+                    ),
+                )
+            )
+
+    return expected_inds
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
+def test_top_k_1d_smallest(dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    shift, k = 734, 5
+    o = dpt.ones(n, dtype=dtype)
+    z = dpt.zeros(n, dtype=dtype)
+    oz = dpt.concat((o, z))
+    inp = dpt.roll(oz, shift)
+
+    expected_inds = _expected_smallest_inds(oz, n, shift, k)
+
+    s = dpt.top_k(inp, k, mode="smallest")
+    assert s.values.shape == (k,)
+    assert s.values.dtype == inp.dtype
+    assert s.indices.shape == (k,)
+    assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)), s.values
+    assert dpt.all(s.values == inp[s.indices]), s.indices
+    assert dpt.all(s.indices == expected_inds), (s.indices, expected_inds)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        # skip short types to ensure that m*n can be represented
+        # in the type
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
+def test_top_k_2d_largest(dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m, k = 8, 3
+    if dtype == "f2" and m * n > 2000:
+        pytest.skip(
+            "f2 can not distinguish between large integers used in this test"
+        )
+
+    x = dpt.reshape(dpt.arange(m * n, dtype=dtype), (m, n))
+
+    r = dpt.top_k(x, k, axis=1)
+
+    assert r.values.shape == (m, k)
+    assert r.indices.shape == (m, k)
+    expected_inds = dpt.reshape(dpt.arange(n, dtype=r.indices.dtype), (1, n))[
+        :, -k:
+    ]
+    assert expected_inds.shape == (1, k)
+    assert dpt.all(
+        dpt.sort(r.indices, axis=1) == dpt.sort(expected_inds, axis=1)
+    ), (r.indices, expected_inds)
+    expected_vals = x[:, -k:]
+    assert dpt.all(
+        dpt.sort(r.values, axis=1) == dpt.sort(expected_vals, axis=1)
+    )
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        # skip short types to ensure that m*n can be represented
+        # in the type
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
+def test_top_k_2d_smallest(dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m, k = 8, 3
+    if dtype == "f2" and m * n > 2000:
+        pytest.skip(
+            "f2 can not distinguish between large integers used in this test"
+        )
+
+    x = dpt.reshape(dpt.arange(m * n, dtype=dtype), (m, n))
+
+    r = dpt.top_k(x, k, axis=1, mode="smallest")
+
+    assert r.values.shape == (m, k)
+    assert r.indices.shape == (m, k)
+    expected_inds = dpt.reshape(dpt.arange(n, dtype=r.indices.dtype), (1, n))[
+        :, :k
+    ]
+    assert dpt.all(
+        dpt.sort(r.indices, axis=1) == dpt.sort(expected_inds, axis=1)
+    )
+    assert dpt.all(dpt.sort(r.values, axis=1) == dpt.sort(x[:, :k], axis=1))
+
+
+def test_top_k_0d():
+    get_queue_or_skip()
+
+    a = dpt.ones(tuple(), dtype="i4")
+    assert a.ndim == 0
+    assert a.size == 1
+
+    r = dpt.top_k(a, 1)
+    assert r.values == a
+    assert r.indices == dpt.zeros_like(a, dtype=r.indices.dtype)
+
+
+def test_top_k_noncontig():
+    get_queue_or_skip()
+
+    a = dpt.arange(256, dtype=dpt.int32)[::2]
+    r = dpt.top_k(a, 3)
+
+    assert dpt.all(dpt.sort(r.values) == dpt.asarray([250, 252, 254])), r.values
+    assert dpt.all(
+        dpt.sort(r.indices) == dpt.asarray([125, 126, 127])
+    ), r.indices
+
+
+def test_top_k_axis0():
+    get_queue_or_skip()
+
+    m, n, k = 128, 8, 3
+    x = dpt.reshape(dpt.arange(m * n, dtype=dpt.int32), (m, n))
+
+    r = dpt.top_k(x, k, axis=0, mode="smallest")
+    assert r.values.shape == (k, n)
+    assert r.indices.shape == (k, n)
+    expected_inds = dpt.reshape(dpt.arange(m, dtype=r.indices.dtype), (m, 1))[
+        :k, :
+    ]
+    assert dpt.all(
+        dpt.sort(r.indices, axis=0) == dpt.sort(expected_inds, axis=0)
+    )
+    assert dpt.all(dpt.sort(r.values, axis=0) == dpt.sort(x[:k, :], axis=0))
+
+
+def test_top_k_validation():
+    get_queue_or_skip()
+    x = dpt.ones(10, dtype=dpt.int64)
+    with pytest.raises(ValueError):
+        # k must be positive
+        dpt.top_k(x, -1)
+    with pytest.raises(TypeError):
+        # argument should be usm_ndarray
+        dpt.top_k(list(), 2)
+    x2 = dpt.reshape(x, (2, 5))
+    with pytest.raises(ValueError):
+        # k must not exceed array dimension
+        # along specified axis
+        dpt.top_k(x2, 100, axis=1)
+    with pytest.raises(ValueError):
+        # for 0d arrays, k must be 1
+        dpt.top_k(x[0], 2)
+    with pytest.raises(ValueError):
+        # mode must be "largest", or "smallest"
+        dpt.top_k(x, 2, mode="invalid")
diff --git a/dpnp/tests/tensor/test_usm_ndarray_unique.py b/dpnp/tests/tensor/test_usm_ndarray_unique.py
new file mode 100644
index 000000000000..d602c0346f5d
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_unique.py
@@ -0,0 +1,361 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_unique_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, roll = 10000, 734
+    inp = dpt.roll(
+        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
+        roll,
+    )
+
+    uv = dpt.unique_values(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
+
+
+def test_unique_values_strided():
+    get_queue_or_skip()
+
+    n, m = 1000, 20
+    inp = dpt.ones((n, m), dtype="i4", order="F")
+    inp[:, ::2] = 0
+
+    uv = dpt.unique_values(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+
+    inp = dpt.reshape(inp, -1)
+    inp = dpt.flip(dpt.reshape(inp, -1))
+
+    uv = dpt.unique_values(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_unique_counts(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, roll = 10000, 734
+    inp = dpt.roll(
+        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
+        roll,
+    )
+
+    uv, uv_counts = dpt.unique_counts(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
+    assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype))
+
+
+def test_unique_counts_strided():
+    get_queue_or_skip()
+
+    n, m = 1000, 20
+    inp = dpt.ones((n, m), dtype="i4", order="F")
+    inp[:, ::2] = 0
+
+    uv, uv_counts = dpt.unique_counts(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
+
+    inp = dpt.flip(dpt.reshape(inp, -1))
+
+    uv, uv_counts = dpt.unique_counts(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_unique_inverse(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, roll = 10000, 734
+    inp = dpt.roll(
+        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
+        roll,
+    )
+
+    uv, inv = dpt.unique_inverse(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+
+
+def test_unique_inverse_strided():
+    get_queue_or_skip()
+
+    n, m = 1000, 20
+    inp = dpt.ones((n, m), dtype="i4", order="F")
+    inp[:, ::2] = 0
+
+    uv, inv = dpt.unique_inverse(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+
+    inp = dpt.flip(dpt.reshape(inp, -1))
+
+    uv, inv = dpt.unique_inverse(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_unique_all(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, roll = 10000, 734
+    inp = dpt.roll(
+        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
+        roll,
+    )
+
+    uv, ind, inv, uv_counts = dpt.unique_all(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
+    assert dpt.all(uv == inp[ind])
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+    assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype))
+
+
+def test_unique_all_strided():
+    get_queue_or_skip()
+
+    n, m = 1000, 20
+    inp = dpt.ones((n, m), dtype="i4", order="F")
+    inp[:, ::2] = 0
+
+    uv, ind, inv, uv_counts = dpt.unique_all(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(uv == dpt.reshape(inp, -1)[ind])
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
+
+    inp = dpt.flip(dpt.reshape(inp, -1))
+
+    uv, ind, inv, uv_counts = dpt.unique_all(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(uv == inp[ind])
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
+
+
+def test_set_functions_empty_input():
+    get_queue_or_skip()
+    x = dpt.ones((10, 0, 1), dtype="i4")
+
+    res = dpt.unique_values(x)
+    assert isinstance(res, dpt.usm_ndarray)
+    assert res.size == 0
+    assert res.dtype == x.dtype
+
+    res = dpt.unique_inverse(x)
+    assert type(res).__name__ == "UniqueInverseResult"
+    uv, inv = res
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert uv.size == 0
+    assert isinstance(inv, dpt.usm_ndarray)
+    assert inv.size == 0
+
+    res = dpt.unique_counts(x)
+    assert type(res).__name__ == "UniqueCountsResult"
+    uv, uv_counts = res
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert uv.size == 0
+    assert isinstance(uv_counts, dpt.usm_ndarray)
+    assert uv_counts.size == 0
+
+    res = dpt.unique_all(x)
+    assert type(res).__name__ == "UniqueAllResult"
+    uv, ind, inv, uv_counts = res
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert uv.size == 0
+    assert isinstance(ind, dpt.usm_ndarray)
+    assert ind.size == 0
+    assert isinstance(inv, dpt.usm_ndarray)
+    assert inv.size == 0
+    assert isinstance(uv_counts, dpt.usm_ndarray)
+    assert uv_counts.size == 0
+
+
+def test_set_function_outputs():
+    get_queue_or_skip()
+    # check standard and early exit paths
+    x1 = dpt.arange(10, dtype="i4")
+    x2 = dpt.ones((10, 10), dtype="i4")
+
+    assert isinstance(dpt.unique_values(x1), dpt.usm_ndarray)
+    assert isinstance(dpt.unique_values(x2), dpt.usm_ndarray)
+
+    assert type(dpt.unique_inverse(x1)).__name__ == "UniqueInverseResult"
+    assert type(dpt.unique_inverse(x2)).__name__ == "UniqueInverseResult"
+
+    assert type(dpt.unique_counts(x1)).__name__ == "UniqueCountsResult"
+    assert type(dpt.unique_counts(x2)).__name__ == "UniqueCountsResult"
+
+    assert type(dpt.unique_all(x1)).__name__ == "UniqueAllResult"
+    assert type(dpt.unique_all(x2)).__name__ == "UniqueAllResult"
+
+
+def test_set_functions_compute_follows_data():
+    # tests that all intermediate calls and allocations
+    # are compatible with an input with an arbitrary queue
+    get_queue_or_skip()
+    q = dpctl.SyclQueue()
+    x = dpt.arange(10, dtype="i4", sycl_queue=q)
+
+    uv = dpt.unique_values(x)
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert uv.sycl_queue == q
+    uv, uc = dpt.unique_counts(x)
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert isinstance(uc, dpt.usm_ndarray)
+    assert uv.sycl_queue == q
+    assert uc.sycl_queue == q
+    uv, inv_ind = dpt.unique_inverse(x)
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert isinstance(inv_ind, dpt.usm_ndarray)
+    assert uv.sycl_queue == q
+    assert inv_ind.sycl_queue == q
+    uv, ind, inv_ind, uc = dpt.unique_all(x)
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert isinstance(ind, dpt.usm_ndarray)
+    assert isinstance(inv_ind, dpt.usm_ndarray)
+    assert isinstance(uc, dpt.usm_ndarray)
+    assert uv.sycl_queue == q
+    assert ind.sycl_queue == q
+    assert inv_ind.sycl_queue == q
+    assert uc.sycl_queue == q
+
+
+def test_gh_1738():
+    get_queue_or_skip()
+
+    ones = dpt.ones(10, dtype="i8")
+    iota = dpt.arange(10, dtype="i8")
+
+    assert ones.device == iota.device
+
+    dpt_info = dpt.__array_namespace_info__()
+    ind_dt = dpt_info.default_dtypes(device=ones.device)["indexing"]
+
+    dt = dpt.unique_inverse(ones).inverse_indices.dtype
+    assert dt == ind_dt
+    dt = dpt.unique_all(ones).inverse_indices.dtype
+    assert dt == ind_dt
+
+    dt = dpt.unique_inverse(iota).inverse_indices.dtype
+    assert dt == ind_dt
+    dt = dpt.unique_all(iota).inverse_indices.dtype
+    assert dt == ind_dt
diff --git a/dpnp/tests/tensor/test_usm_ndarray_utility_functions.py b/dpnp/tests/tensor/test_usm_ndarray_utility_functions.py
new file mode 100644
index 000000000000..b6d6293ade73
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_utility_functions.py
@@ -0,0 +1,199 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from random import randrange
+
+import numpy as np
+import pytest
+from numpy.testing import assert_array_equal, assert_equal
+
+import dpnp.tensor as dpt
+from dpnp.tensor._numpy_helper import AxisError
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_all_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)])
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_boolean_reduction_dtypes_contig(func, identity, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.full(10, identity, dtype=dtype, sycl_queue=q)
+    res = func(x)
+
+    assert_equal(dpt.asnumpy(res), identity)
+
+    x[randrange(x.size)] = not identity
+    res = func(x)
+    assert_equal(dpt.asnumpy(res), not identity)
+
+    # test branch in kernel for large arrays
+    wg_size = 4 * 32
+    x = dpt.full((wg_size + 1), identity, dtype=dtype, sycl_queue=q)
+    res = func(x)
+    assert_equal(dpt.asnumpy(res), identity)
+
+    x[randrange(x.size)] = not identity
+    res = func(x)
+    assert_equal(dpt.asnumpy(res), not identity)
+
+
+@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)])
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_boolean_reduction_dtypes_strided(func, identity, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.full(20, identity, dtype=dtype, sycl_queue=q)[::-2]
+    res = func(x)
+    assert_equal(dpt.asnumpy(res), identity)
+
+    x[randrange(x.size)] = not identity
+    res = func(x)
+    assert_equal(dpt.asnumpy(res), not identity)
+
+
+@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)])
+def test_boolean_reduction_axis(func, identity):
+    get_queue_or_skip()
+
+    x = dpt.full((2, 3, 4, 5, 6), identity, dtype="i4")
+    res = func(x, axis=(1, 2, -1))
+
+    assert res.shape == (2, 5)
+    assert_array_equal(dpt.asnumpy(res), np.full(res.shape, identity))
+
+    # make first row of output negation of identity
+    x[0, 0, 0, ...] = not identity
+    res = func(x, axis=(1, 2, -1))
+    assert_array_equal(dpt.asnumpy(res[0]), np.full(res.shape[1], not identity))
+
+
+@pytest.mark.parametrize("func", [dpt.all, dpt.any])
+def test_boolean_reduction_keepdims(func):
+    get_queue_or_skip()
+
+    x = dpt.ones((2, 3, 4, 5, 6), dtype="i4")
+    res = func(x, axis=(1, 2, -1), keepdims=True)
+    assert res.shape == (2, 1, 1, 5, 1)
+    assert_array_equal(dpt.asnumpy(res), np.full(res.shape, True))
+
+    res = func(x, axis=None, keepdims=True)
+    assert res.shape == (1,) * x.ndim
+
+
+@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)])
+def test_boolean_reduction_empty(func, identity):
+    get_queue_or_skip()
+
+    x = dpt.empty((0,), dtype="i4")
+    res = func(x)
+    assert_equal(dpt.asnumpy(res), identity)
+
+
+# nan, inf, and -inf should evaluate to true
+@pytest.mark.parametrize("func", [dpt.all, dpt.any])
+def test_boolean_reductions_nan_inf(func):
+    q = get_queue_or_skip()
+
+    x = dpt.asarray([dpt.nan, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q)[
+        :, dpt.newaxis
+    ]
+    res = func(x, axis=1)
+    assert_array_equal(dpt.asnumpy(res), np.array([True, True, True]))
+
+
+@pytest.mark.parametrize("func", [dpt.all, dpt.any])
+def test_boolean_reduction_scalars(func):
+    get_queue_or_skip()
+
+    x = dpt.ones((), dtype="i4")
+    assert_equal(dpt.asnumpy(func(x)), True)
+
+    x = dpt.zeros((), dtype="i4")
+    assert_equal(dpt.asnumpy(func(x)), False)
+
+
+@pytest.mark.parametrize("func", [dpt.all, dpt.any])
+def test_boolean_reduction_empty_axis(func):
+    get_queue_or_skip()
+
+    x = dpt.ones((5,), dtype="i4")
+    res = func(x, axis=())
+    assert_array_equal(dpt.asnumpy(res), dpt.asnumpy(x).astype(np.bool_))
+
+
+@pytest.mark.parametrize("func", [dpt.all, dpt.any])
+def test_arg_validation_boolean_reductions(func):
+    get_queue_or_skip()
+
+    x = dpt.ones((4, 5), dtype="i4")
+    d = {}
+
+    with pytest.raises(TypeError):
+        func(d)
+    with pytest.raises(AxisError):
+        func(x, axis=-3)
+
+
+def test_boolean_reductions_3d_gh_1327():
+    get_queue_or_skip()
+
+    size = 24
+    x = dpt.reshape(dpt.arange(-10, size - 10, 1, dtype="i4"), (2, 3, 4))
+    res = dpt.all(x, axis=0)
+    res_np = np.full(res.shape, True, dtype="?")
+    res_np[2, 2] = False
+
+    assert (dpt.asnumpy(res) == res_np).all()
+
+    x = dpt.ones((2, 3, 4, 5), dtype="i4")
+    res = dpt.any(x, axis=0)
+
+    assert (dpt.asnumpy(res) == np.full(res.shape, True, dtype="?")).all()
diff --git a/dpnp/tests/test_ndarray.py b/dpnp/tests/test_ndarray.py
index a16ccd9f5bf3..c79452c30ee3 100644
--- a/dpnp/tests/test_ndarray.py
+++ b/dpnp/tests/test_ndarray.py
@@ -490,6 +490,9 @@ def test_print_dpnp_special_character(character):
     assert result == expected
 
 
+# TODO: repr formatting is inconsistent (scientific vs integer-like output)
+# This is a minor issue that does not depend on compiler flags
+@pytest.mark.skip(reason="SAT-8452")
 def test_print_dpnp_1d():
     dtype = dpnp.default_float_type()
     result = repr(dpnp.arange(10000, dtype=dtype))
diff --git a/pyproject.toml b/pyproject.toml
index 09253467b8dc..4935b0e9ac80 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314']
 [tool.codespell]
 builtin = "clear,rare,informal,names"
 check-filenames = true
-ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT,OffsetT,inpT"
+ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT,OffsetT,inpT, wit"
 quiet-level = 3
 
 [tool.coverage.report]
diff --git a/setup.py b/setup.py
index 86899c27ca65..3f5449663508 100644
--- a/setup.py
+++ b/setup.py
@@ -53,12 +53,14 @@
             "dpnp_backend_c.lib",
             "dpnp_backend_c.dll",
             "tests/*.*",
+            "tests/tensor/*.py",
+            "tests/tensor/*/*.py",
             "tests/testing/*.py",
             "tests/third_party/cupy/*.py",
             "tests/third_party/cupy/*/*.py",
             "tests/third_party/cupyx/*.py",
             "tests/third_party/cupyx/*/*.py",
-        ]
+        ],
     },
     include_package_data=False,
 )

From cf2a2867c922329d1155d7eb954e52d94c3e8e23 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 10 Apr 2026 17:19:13 +0200
Subject: [PATCH 25/43] Move compute follows data utils (#2840)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR proposes to move the file `_compute_follows_data.pyx` from
`dpctl.utils` to `dpnp.tensor` as part of the migration of
`dpctl.tensor` to `dpnp.tensor`

### Changes
>- **Moved file**: `dpctl/utils/_compute_follows_data.pyx` →
`dpnp/tensor/_compute_follows_data.pyx`
>- **Exports** (now available from `dpnp.tensor`):
>>- `ExecutionPlacementError` - exception for execution placement errors
>>- `get_execution_queue()` - determine execution queue from input
arrays
>>- `get_coerced_usm_type()` - determine output USM type for
compute-follows-data
>>- `validate_usm_type()` - validate USM type specifications
---
 dpnp/dpnp_algo/dpnp_arraycreation.py          |   3 +-
 dpnp/dpnp_algo/dpnp_elementwise_common.py     |   4 +-
 dpnp/dpnp_algo/dpnp_fill.py                   |   4 +-
 dpnp/dpnp_container.py                        |  16 +-
 dpnp/dpnp_iface.py                            |   2 +-
 dpnp/dpnp_iface_histograms.py                 |   7 +-
 dpnp/dpnp_iface_indexing.py                   |   8 +-
 dpnp/dpnp_iface_logic.py                      |   4 +-
 dpnp/dpnp_iface_mathematical.py               |  10 +-
 dpnp/dpnp_iface_statistics.py                 |   2 +-
 dpnp/dpnp_utils/dpnp_algo_utils.pyx           |  12 +-
 dpnp/dpnp_utils/dpnp_utils_einsum.py          |   5 +-
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py   |   5 +-
 dpnp/dpnp_utils/dpnp_utils_statistics.py      |   7 +-
 dpnp/exceptions/__init__.py                   |   2 +-
 dpnp/fft/dpnp_utils_fft.py                    |  10 +-
 dpnp/random/dpnp_random_state.py              |   6 +-
 dpnp/tensor/__init__.py                       |  11 +
 dpnp/tensor/_accumulation.py                  |   7 +-
 dpnp/tensor/_clip.py                          |  49 ++---
 dpnp/tensor/_compute_follows_data.pyx         | 191 ++++++++++++++++++
 dpnp/tensor/_copy_utils.py                    |  50 +++--
 dpnp/tensor/_ctors.py                         |  76 ++++---
 dpnp/tensor/_device.py                        |   4 +-
 dpnp/tensor/_elementwise_common.py            |  35 ++--
 dpnp/tensor/_indexing_functions.py            |  49 ++---
 dpnp/tensor/_linear_algebra_functions.py      |  31 ++-
 dpnp/tensor/_manipulation_functions.py        |  29 ++-
 dpnp/tensor/_print.py                         |   4 +-
 dpnp/tensor/_reduction.py                     |  15 +-
 dpnp/tensor/_reshape.py                       |   4 +-
 dpnp/tensor/_search_functions.py              |  27 ++-
 dpnp/tensor/_searchsorted.py                  |  13 +-
 dpnp/tensor/_set_functions.py                 |  10 +-
 dpnp/tensor/_testing.py                       |   5 +-
 dpnp/tensor/_utility_functions.py             |  36 ++--
 dpnp/tests/tensor/elementwise/test_add.py     |  17 +-
 dpnp/tests/tensor/elementwise/test_divide.py  |   4 +-
 dpnp/tests/tensor/elementwise/test_equal.py   |   4 +-
 .../tensor/elementwise/test_floor_divide.py   |   4 +-
 dpnp/tests/tensor/elementwise/test_greater.py |   4 +-
 .../tensor/elementwise/test_greater_equal.py  |   4 +-
 dpnp/tests/tensor/elementwise/test_hypot.py   |   4 +-
 dpnp/tests/tensor/elementwise/test_less.py    |   4 +-
 .../tensor/elementwise/test_less_equal.py     |   4 +-
 .../tensor/elementwise/test_logaddexp.py      |   4 +-
 .../tensor/elementwise/test_logical_and.py    |   4 +-
 .../tensor/elementwise/test_logical_or.py     |   4 +-
 .../tensor/elementwise/test_logical_xor.py    |   4 +-
 .../elementwise/test_maximum_minimum.py       |   8 +-
 .../tests/tensor/elementwise/test_multiply.py |   4 +-
 .../tensor/elementwise/test_not_equal.py      |   4 +-
 dpnp/tests/tensor/elementwise/test_pow.py     |   4 +-
 .../tensor/elementwise/test_remainder.py      |   4 +-
 .../tests/tensor/elementwise/test_subtract.py |   4 +-
 .../tensor/elementwise/test_type_utils.py     |   2 +-
 dpnp/tests/tensor/test_tensor_accumulation.py |   3 +-
 dpnp/tests/tensor/test_tensor_asarray.py      |   2 +-
 dpnp/tests/tensor/test_tensor_clip.py         |  27 ++-
 dpnp/tests/tensor/test_tensor_diff.py         |  11 +-
 dpnp/tests/tensor/test_tensor_isin.py         |   5 +-
 dpnp/tests/tensor/test_usm_ndarray_ctor.py    |   4 +-
 .../tests/tensor/test_usm_ndarray_indexing.py |  29 ++-
 dpnp/tests/tensor/test_usm_ndarray_linalg.py  |   9 +-
 .../tensor/test_usm_ndarray_manipulation.py   |   3 +-
 .../tensor/test_usm_ndarray_reductions.py     |   8 +-
 .../test_usm_ndarray_search_functions.py      |   9 +-
 .../tensor/test_usm_ndarray_searchsorted.py   |   5 +-
 dpnp/tests/test_fft.py                        |   3 +-
 dpnp/tests/test_fill.py                       |   2 +-
 dpnp/tests/test_indexing.py                   |   5 +-
 dpnp/tests/test_linalg.py                     |   7 +-
 dpnp/tests/test_logic.py                      |   2 +-
 dpnp/tests/test_mathematical.py               |  11 +-
 dpnp/tests/test_nanfunctions.py               |   3 +-
 dpnp/tests/test_product.py                    |   2 +-
 dpnp/tests/test_sycl_queue.py                 |   5 +-
 dpnp/tests/test_usm_type.py                   |  93 +++++----
 78 files changed, 607 insertions(+), 489 deletions(-)
 create mode 100644 dpnp/tensor/_compute_follows_data.pyx

diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py
index 66d8b9d9fbc8..df21ea5bbc44 100644
--- a/dpnp/dpnp_algo/dpnp_arraycreation.py
+++ b/dpnp/dpnp_algo/dpnp_arraycreation.py
@@ -29,7 +29,6 @@
 import math
 import operator
 
-import dpctl.utils as dpu
 import numpy
 
 import dpnp
@@ -340,7 +339,7 @@ class dpnp_nd_grid:
     def __init__(
         self, sparse=False, device=None, usm_type="device", sycl_queue=None
     ):
-        dpu.validate_usm_type(usm_type, allow_none=True)
+        dpt.validate_usm_type(usm_type, allow_none=True)
         self.sparse = sparse
         self.usm_type = "device" if usm_type is None else usm_type
         self.sycl_queue_normalized = dpnp.get_normalized_queue_device(
diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
index 5902d389391f..4eb613db35a2 100644
--- a/dpnp/dpnp_algo/dpnp_elementwise_common.py
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -451,7 +451,7 @@ def __call__(
                     f"Expected output shape is {x.shape}, got {res.shape}"
                 )
 
-            if dpu.get_execution_queue((exec_q, res.sycl_queue)) is None:
+            if dpt.get_execution_queue((exec_q, res.sycl_queue)) is None:
                 raise dpnp.exceptions.ExecutionPlacementError(
                     "Input and output allocation queues are not compatible"
                 )
@@ -1062,7 +1062,7 @@ def __call__(
                     f"Expected output shape is {res_shape}, got {res.shape}"
                 )
 
-            if dpu.get_execution_queue((exec_q, res.sycl_queue)) is None:
+            if dpt.get_execution_queue((exec_q, res.sycl_queue)) is None:
                 raise dpnp.exceptions.ExecutionPlacementError(
                     "Input and output allocation queues are not compatible"
                 )
diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py
index 7afda62bb07f..03c47dea169c 100644
--- a/dpnp/dpnp_algo/dpnp_fill.py
+++ b/dpnp/dpnp_algo/dpnp_fill.py
@@ -49,8 +49,8 @@ def dpnp_fill(arr, val):
         val = dpnp.get_usm_ndarray(val)
         if val.shape != ():
             raise ValueError("`val` must be a scalar or 0D-array")
-        if dpu.get_execution_queue((exec_q, val.sycl_queue)) is None:
-            raise dpu.ExecutionPlacementError(
+        if dpt.get_execution_queue((exec_q, val.sycl_queue)) is None:
+            raise dpt.ExecutionPlacementError(
                 "Input arrays have incompatible queues."
             )
         a_val = dpt.astype(val, arr.dtype)
diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py
index 14d9278579ba..374cc2c26f09 100644
--- a/dpnp/dpnp_container.py
+++ b/dpnp/dpnp_container.py
@@ -35,8 +35,6 @@
 
 """
 
-import dpctl.utils as dpu
-
 import dpnp
 import dpnp.tensor as dpt
 from dpnp.dpnp_array import dpnp_array
@@ -67,7 +65,7 @@ def arange(
     sycl_queue=None,
 ):
     """Validate input parameters before passing them into `dpctl.tensor` module"""
-    dpu.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue_normalized = dpnp.get_normalized_queue_device(
         sycl_queue=sycl_queue, device=device
     )
@@ -93,7 +91,7 @@ def asarray(
     sycl_queue=None,
 ):
     """Converts `x1` to `dpnp_array`."""
-    dpu.validate_usm_type(usm_type, allow_none=True)
+    dpt.validate_usm_type(usm_type, allow_none=True)
 
     if order is None:
         order = "K"
@@ -154,7 +152,7 @@ def empty(
     sycl_queue=None,
 ):
     """Validate input parameters before passing them into `dpctl.tensor` module"""
-    dpu.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue_normalized = dpnp.get_normalized_queue_device(
         sycl_queue=sycl_queue, device=device
     )
@@ -185,7 +183,7 @@ def eye(
     sycl_queue=None,
 ):
     """Validate input parameters before passing them into `dpctl.tensor` module"""
-    dpu.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue_normalized = dpnp.get_normalized_queue_device(
         sycl_queue=sycl_queue, device=device
     )
@@ -216,7 +214,7 @@ def full(
     sycl_queue=None,
 ):
     """Validate input parameters before passing them into `dpctl.tensor` module"""
-    dpu.validate_usm_type(usm_type, allow_none=True)
+    dpt.validate_usm_type(usm_type, allow_none=True)
 
     sycl_queue_normalized = dpnp.get_normalized_queue_device(
         fill_value, sycl_queue=sycl_queue, device=device
@@ -249,7 +247,7 @@ def ones(
     sycl_queue=None,
 ):
     """Validate input parameters before passing them into `dpctl.tensor` module"""
-    dpu.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue_normalized = dpnp.get_normalized_queue_device(
         sycl_queue=sycl_queue, device=device
     )
@@ -289,7 +287,7 @@ def zeros(
     sycl_queue=None,
 ):
     """Validate input parameters before passing them into `dpctl.tensor` module"""
-    dpu.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue_normalized = dpnp.get_normalized_queue_device(
         sycl_queue=sycl_queue, device=device
     )
diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index acca10a2211b..a9bf24bc56ea 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -407,7 +407,7 @@ def get_dpnp_descriptor(
     if queue is not None and copy_when_nondefault_queue:
         default_queue = dpctl.SyclQueue()
         queue_is_default = (
-            dpctl.utils.get_execution_queue([queue, default_queue]) is not None
+            dpt.get_execution_queue([queue, default_queue]) is not None
         )
         if not queue_is_default:
             ext_obj = dpnp.array(ext_obj, sycl_queue=default_queue)
diff --git a/dpnp/dpnp_iface_histograms.py b/dpnp/dpnp_iface_histograms.py
index 8f3363e79fe0..944302d31205 100644
--- a/dpnp/dpnp_iface_histograms.py
+++ b/dpnp/dpnp_iface_histograms.py
@@ -53,6 +53,7 @@
     result_type_for_device,
     to_supported_dtypes,
 )
+from dpnp.tensor import get_coerced_usm_type, get_execution_queue
 
 # pylint: disable=no-name-in-module
 from .dpnp_utils import get_usm_allocations
@@ -87,10 +88,10 @@ def _ravel_check_a_and_weights(a, weights):
     if weights is not None:
         # check that `weights` array has supported type
         dpnp.check_supported_arrays_type(weights)
-        usm_type = dpu.get_coerced_usm_type([usm_type, weights.usm_type])
+        usm_type = get_coerced_usm_type([usm_type, weights.usm_type])
 
         # check that arrays have the same allocation queue
-        if dpu.get_execution_queue([a.sycl_queue, weights.sycl_queue]) is None:
+        if get_execution_queue([a.sycl_queue, weights.sycl_queue]) is None:
             raise ValueError(
                 "a and weights must be allocated on the same SYCL queue"
             )
@@ -173,7 +174,7 @@ def _get_bin_edges(a, bins, range, usm_type):
 
     elif numpy.ndim(bins) == 1:
         if dpnp.is_supported_array_type(bins):
-            if dpu.get_execution_queue([a.sycl_queue, bins.sycl_queue]) is None:
+            if get_execution_queue([a.sycl_queue, bins.sycl_queue]) is None:
                 raise ValueError(
                     "a and bins must be allocated on the same SYCL queue"
                 )
diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index a24c8f56844a..26a0c826cf4a 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -130,8 +130,8 @@ def _choose_run(inds, chcs, q, usm_type, out=None, mode=0):
                 f"got {out.dtype}"
             )
 
-        if dpu.get_execution_queue((q, out.sycl_queue)) is None:
-            raise dpu.ExecutionPlacementError(
+        if dpt.get_execution_queue((q, out.sycl_queue)) is None:
+            raise dpt.ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
 
@@ -292,8 +292,8 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0):
                 f"Output array of type {x.dtype} is needed, " f"got {out.dtype}"
             )
 
-        if dpu.get_execution_queue((q, out.sycl_queue)) is None:
-            raise dpu.ExecutionPlacementError(
+        if dpt.get_execution_queue((q, out.sycl_queue)) is None:
+            raise dpt.ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
 
diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py
index ce1d40774ca0..12014b4b5151 100644
--- a/dpnp/dpnp_iface_logic.py
+++ b/dpnp/dpnp_iface_logic.py
@@ -1261,12 +1261,12 @@ def isin(
         usm_element = dpnp.get_usm_ndarray(element)
     else:
         if (
-            dpu.get_execution_queue(
+            dpt.get_execution_queue(
                 (element.sycl_queue, test_elements.sycl_queue)
             )
             is None
         ):
-            raise dpu.ExecutionPlacementError(
+            raise dpt.ExecutionPlacementError(
                 "Input arrays have incompatible allocation queues"
             )
         usm_element = dpnp.get_usm_ndarray(element)
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index 22517d9cccca..e46bbe8f323d 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -270,10 +270,10 @@ def _process_ediff1d_args(arg, arg_name, ary_dtype, ary_sycl_queue, usm_type):
     if not dpnp.is_supported_array_type(arg):
         arg = dpnp.asarray(arg, usm_type=usm_type, sycl_queue=ary_sycl_queue)
     else:
-        usm_type = dpu.get_coerced_usm_type([usm_type, arg.usm_type])
+        usm_type = dpt.get_coerced_usm_type([usm_type, arg.usm_type])
         # check that arrays have the same allocation queue
-        if dpu.get_execution_queue([ary_sycl_queue, arg.sycl_queue]) is None:
-            raise dpu.ExecutionPlacementError(
+        if dpt.get_execution_queue([ary_sycl_queue, arg.sycl_queue]) is None:
+            raise dpt.ExecutionPlacementError(
                 f"ary and {arg_name} must be allocated on the same SYCL queue"
             )
 
@@ -304,7 +304,7 @@ def _validate_interp_param(param, name, exec_q, usm_type, dtype=None):
                 f"a {name} value must be 0-dimensional, "
                 f"but got {param.ndim}-dim"
             )
-        if dpu.get_execution_queue([exec_q, param.sycl_queue]) is None:
+        if dpt.get_execution_queue([exec_q, param.sycl_queue]) is None:
             raise ValueError(
                 f"input arrays and {name} must be allocated "
                 "on the same SYCL queue"
@@ -2721,7 +2721,7 @@ def gradient(f, *varargs, axis=None, edge_order=1):
         if dpnp.isscalar(ax_dx):
             usm_type = f.usm_type
         else:
-            usm_type = dpu.get_coerced_usm_type([f.usm_type, ax_dx.usm_type])
+            usm_type = dpt.get_coerced_usm_type([f.usm_type, ax_dx.usm_type])
         out = dpnp.empty_like(f, dtype=otype, usm_type=usm_type)
 
         # spacing for the current axis
diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py
index 3d1f62ef716e..bf27fc98a4ce 100644
--- a/dpnp/dpnp_iface_statistics.py
+++ b/dpnp/dpnp_iface_statistics.py
@@ -670,7 +670,7 @@ def _run_native_sliding_dot_product1d(a, v, l_pad, r_pad, rdtype):
     a_casted = dpnp.asarray(a, dtype=supported_dtype, order="C")
     v_casted = dpnp.asarray(v, dtype=supported_dtype, order="C")
 
-    usm_type = dpu.get_coerced_usm_type([a_casted.usm_type, v_casted.usm_type])
+    usm_type = dpt.get_coerced_usm_type([a_casted.usm_type, v_casted.usm_type])
     out_size = l_pad + r_pad + a_casted.size - v_casted.size + 1
     # out type is the same as input type
     out = dpnp.empty_like(a_casted, shape=out_size, usm_type=usm_type)
diff --git a/dpnp/dpnp_utils/dpnp_algo_utils.pyx b/dpnp/dpnp_utils/dpnp_algo_utils.pyx
index 938d9118545b..11737831f014 100644
--- a/dpnp/dpnp_utils/dpnp_algo_utils.pyx
+++ b/dpnp/dpnp_utils/dpnp_algo_utils.pyx
@@ -36,13 +36,13 @@ This module contains different helpers and utilities
 """
 
 import dpctl
-import dpctl.utils as dpu
 import numpy
 
 import dpnp
 import dpnp.config as config
 import dpnp.dpnp_container as dpnp_container
 from dpnp.dpnp_array import dpnp_array
+from dpnp.tensor import get_coerced_usm_type, get_execution_queue
 
 cimport cpython
 cimport cython
@@ -153,7 +153,7 @@ def call_origin(function, *args, **kwargs):
         kwargx = convert_item(kwarg)
         kwargs_new[key] = kwargx
 
-    exec_q = dpu.get_execution_queue(alloc_queues)
+    exec_q = get_execution_queue(alloc_queues)
     if exec_q is None:
         exec_q = dpnp.get_normalized_queue_device(sycl_queue=sycl_queue)
     # print(f"DPNP call_origin(): backend called. \n\t function={function}, \n\t args_new={args_new}, \n\t kwargs_new={kwargs_new}, \n\t dpnp_inplace={dpnp_inplace}")
@@ -221,7 +221,7 @@ def _get_coerced_usm_type(objects):
     elif len(types_in_use) == 1:
         return types_in_use[0]
 
-    common_usm_type = dpu.get_coerced_usm_type(types_in_use)
+    common_usm_type = get_coerced_usm_type(types_in_use)
     if common_usm_type is None:
         raise ValueError("Input arrays must have coerced USM types")
     return common_usm_type
@@ -234,7 +234,7 @@ def _get_common_allocation_queue(objects):
     elif len(queues_in_use) == 1:
         return queues_in_use[0]
 
-    common_queue = dpu.get_execution_queue(queues_in_use)
+    common_queue = get_execution_queue(queues_in_use)
     if common_queue is None:
         raise ValueError("Input arrays must be allocated on the same SYCL queue")
     return common_queue
@@ -401,13 +401,13 @@ cdef tuple get_common_usm_allocation(dpnp_descriptor x1, dpnp_descriptor x2):
     array1_obj = x1.get_array()
     array2_obj = x2.get_array()
 
-    common_usm_type = dpctl.utils.get_coerced_usm_type((array1_obj.usm_type, array2_obj.usm_type))
+    common_usm_type = get_coerced_usm_type((array1_obj.usm_type, array2_obj.usm_type))
     if common_usm_type is None:
         raise ValueError(
             "could not recognize common USM type for inputs of USM types {} and {}"
             "".format(array1_obj.usm_type, array2_obj.usm_type))
 
-    common_sycl_queue = dpu.get_execution_queue((array1_obj.sycl_queue, array2_obj.sycl_queue))
+    common_sycl_queue = get_execution_queue((array1_obj.sycl_queue, array2_obj.sycl_queue))
     if common_sycl_queue is None:
         raise ValueError(
             "could not recognize common SYCL queue for inputs in SYCL queues {} and {}"
diff --git a/dpnp/dpnp_utils/dpnp_utils_einsum.py b/dpnp/dpnp_utils/dpnp_utils_einsum.py
index 284268e2868b..81adaf4edc67 100644
--- a/dpnp/dpnp_utils/dpnp_utils_einsum.py
+++ b/dpnp/dpnp_utils/dpnp_utils_einsum.py
@@ -31,13 +31,12 @@
 import operator
 import warnings
 
-import dpctl
 import numpy
-from dpctl.utils import ExecutionPlacementError
 
 import dpnp
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device
+from dpnp.tensor import ExecutionPlacementError
 
 _einsum_symbols = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
@@ -1023,7 +1022,7 @@ def dpnp_einsum(
     res_usm_type, exec_q = get_usm_allocations(arrays)
     if out is not None:
         dpnp.check_supported_arrays_type(out)
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+        if dpnp.tensor.get_execution_queue((exec_q, out.sycl_queue)) is None:
             raise ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index 3ea0ec170bb3..316521b7b7fd 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -28,7 +28,6 @@
 
 import dpctl.utils as dpu
 import numpy
-from dpctl.utils import ExecutionPlacementError
 
 import dpnp
 import dpnp.backend.extensions.blas._blas_impl as bi
@@ -694,8 +693,8 @@ def _validate_out_array(out, exec_q):
     """Validate out is supported array and has correct queue."""
     if out is not None:
         dpnp.check_supported_arrays_type(out)
-        if dpu.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
+        if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise dpt.ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
 
diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py
index 6dd5d6433f82..3a773e29c9e1 100644
--- a/dpnp/dpnp_utils/dpnp_utils_statistics.py
+++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py
@@ -28,9 +28,6 @@
 
 import warnings
 
-import dpctl
-from dpctl.utils import ExecutionPlacementError
-
 import dpnp
 import dpnp.tensor as dpt
 from dpnp.dpnp_array import dpnp_array
@@ -67,9 +64,9 @@ def _calc_nanmedian(a, out=None):
         res = dpnp.empty_like(valid_counts, dtype=a.dtype)
     else:
         dpnp.check_supported_arrays_type(out)
-        exec_q = dpctl.utils.get_execution_queue((a.sycl_queue, out.sycl_queue))
+        exec_q = dpt.get_execution_queue((a.sycl_queue, out.sycl_queue))
         if exec_q is None:
-            raise ExecutionPlacementError(
+            raise dpt.ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
         if out.shape != valid_counts.shape:
diff --git a/dpnp/exceptions/__init__.py b/dpnp/exceptions/__init__.py
index 7e5a55961d51..99587311cf0d 100644
--- a/dpnp/exceptions/__init__.py
+++ b/dpnp/exceptions/__init__.py
@@ -32,9 +32,9 @@
     SyclQueueCreationError,
 )
 from dpctl.memory import USMAllocationError
-from dpctl.utils import ExecutionPlacementError
 from numpy.exceptions import AxisError
 
+from dpnp.tensor import ExecutionPlacementError
 from dpnp.tensor._dlpack import DLPackCreationError
 
 __all__ = [
diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py
index 074f0a66d7bc..7f232e23f426 100644
--- a/dpnp/fft/dpnp_utils_fft.py
+++ b/dpnp/fft/dpnp_utils_fft.py
@@ -41,14 +41,13 @@
 
 from collections.abc import Sequence
 
-import dpctl
 import dpctl.utils as dpu
 import numpy
-from dpctl.utils import ExecutionPlacementError
 
 import dpnp
 import dpnp.backend.extensions.fft._fft_impl as fi
 import dpnp.tensor._tensor_impl as ti
+from dpnp.tensor import ExecutionPlacementError, get_execution_queue
 from dpnp.tensor._numpy_helper import (
     normalize_axis_index,
     normalize_axis_tuple,
@@ -546,10 +545,7 @@ def _validate_out_keyword(a, out, s, axes, c2c, c2r, r2c):
     """Validate out keyword argument."""
     if out is not None:
         dpnp.check_supported_arrays_type(out)
-        if (
-            dpctl.utils.get_execution_queue((a.sycl_queue, out.sycl_queue))
-            is None
-        ):
+        if get_execution_queue((a.sycl_queue, out.sycl_queue)) is None:
             raise ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
@@ -779,7 +775,7 @@ def dpnp_fillfreq(a, m, n, val):
     """Fill an array with the sample frequencies"""
 
     exec_q = a.sycl_queue
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    _manager = dpu.SequentialOrderManager[exec_q]
 
     # it's assumed there are no dependent events to populate the array
     ht_lin_ev, lin_ev = ti._linspace_step(0, 1, a[:m].get_array(), exec_q)
diff --git a/dpnp/random/dpnp_random_state.py b/dpnp/random/dpnp_random_state.py
index e49fe739aedd..e95434bcd410 100644
--- a/dpnp/random/dpnp_random_state.py
+++ b/dpnp/random/dpnp_random_state.py
@@ -36,7 +36,6 @@
 
 """
 
-import dpctl.utils as dpu
 import numpy
 
 import dpnp
@@ -46,6 +45,7 @@
     use_origin_backend,
 )
 from dpnp.random.dpnp_algo_random import MCG59, MT19937
+from dpnp.tensor import validate_usm_type
 
 
 class RandomState:
@@ -269,7 +269,7 @@ def normal(
                         f"scale={scale}, but must be non-negative."
                     )
 
-                dpu.validate_usm_type(usm_type, allow_none=False)
+                validate_usm_type(usm_type, allow_none=False)
                 return self._random_state.normal(
                     loc=loc,
                     scale=scale,
@@ -635,7 +635,7 @@ def uniform(
                 dtype = self._validate_float_dtype(
                     dtype, (dpnp.int32, dpnp.float32, dpnp.float64)
                 )
-                dpu.validate_usm_type(usm_type, allow_none=False)
+                validate_usm_type(usm_type, allow_none=False)
 
                 return self._random_state.uniform(
                     low=low,
diff --git a/dpnp/tensor/__init__.py b/dpnp/tensor/__init__.py
index 03980e194fd0..0118e04f7ab1 100644
--- a/dpnp/tensor/__init__.py
+++ b/dpnp/tensor/__init__.py
@@ -30,6 +30,12 @@
 from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum
 from ._array_api import __array_api_version__, __array_namespace_info__
 from ._clip import clip
+from ._compute_follows_data import (
+    ExecutionPlacementError,
+    get_coerced_usm_type,
+    get_execution_queue,
+    validate_usm_type,
+)
 from ._constants import e, inf, nan, newaxis, pi
 from ._copy_utils import (
     asnumpy,
@@ -411,4 +417,9 @@
     "zeros_like",
     "__array_api_version__",
     "__array_namespace_info__",
+    # utilities
+    "ExecutionPlacementError",
+    "get_coerced_usm_type",
+    "get_execution_queue",
+    "validate_usm_type",
 ]
diff --git a/dpnp/tensor/_accumulation.py b/dpnp/tensor/_accumulation.py
index 305cf263514e..fa1326c3b18d 100644
--- a/dpnp/tensor/_accumulation.py
+++ b/dpnp/tensor/_accumulation.py
@@ -26,8 +26,7 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
 
-import dpctl
-from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
+from dpctl.utils import SequentialOrderManager
 
 import dpnp.tensor as dpt
 import dpnp.tensor._tensor_accumulation_impl as tai
@@ -121,8 +120,8 @@ def _accumulate_common(
             raise ValueError(
                 f"Output array of type {res_dt} is needed, " f"got {out.dtype}"
             )
-        if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
+        if dpt.get_execution_queue((q, out.sycl_queue)) is None:
+            raise dpt.ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
         # permute out array dims if necessary
diff --git a/dpnp/tensor/_clip.py b/dpnp/tensor/_clip.py
index 64020e88ce39..4ba2dcecb370 100644
--- a/dpnp/tensor/_clip.py
+++ b/dpnp/tensor/_clip.py
@@ -26,8 +26,7 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
 
-import dpctl
-from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
+from dpctl.utils import SequentialOrderManager
 
 import dpnp.tensor as dpt
 import dpnp.tensor._tensor_elementwise_impl as tei
@@ -80,19 +79,19 @@ def _clip_none(x, val, out, order, _binary_fn):
         exec_q = q1
         res_usm_type = x_usm_type
     else:
-        exec_q = dpctl.utils.get_execution_queue((q1, q2))
+        exec_q = dpt.get_execution_queue((q1, q2))
         if exec_q is None:
-            raise ExecutionPlacementError(
+            raise dpt.ExecutionPlacementError(
                 "Execution placement can not be unambiguously inferred "
                 "from input arguments."
             )
-        res_usm_type = dpctl.utils.get_coerced_usm_type(
+        res_usm_type = dpt.get_coerced_usm_type(
             (
                 x_usm_type,
                 val_usm_type,
             )
         )
-    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+    dpt.validate_usm_type(res_usm_type, allow_none=False)
     x_shape = x.shape
     val_shape = _get_shape(val)
     if not isinstance(val_shape, (tuple, list)):
@@ -153,8 +152,8 @@ def _clip_none(x, val, out, order, _binary_fn):
                 f"Output array of type {res_dt} is needed, got {out.dtype}"
             )
 
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
+        if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise dpt.ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
 
@@ -340,11 +339,8 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                     f"got {out.dtype}"
                 )
 
-            if (
-                dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
-                is None
-            ):
-                raise ExecutionPlacementError(
+            if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
+                raise dpt.ExecutionPlacementError(
                     "Input and output allocation queues are not compatible"
                 )
 
@@ -388,46 +384,46 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
             exec_q = q1
             res_usm_type = x_usm_type
         elif q3 is None:
-            exec_q = dpctl.utils.get_execution_queue((q1, q2))
+            exec_q = dpt.get_execution_queue((q1, q2))
             if exec_q is None:
-                raise ExecutionPlacementError(
+                raise dpt.ExecutionPlacementError(
                     "Execution placement can not be unambiguously inferred "
                     "from input arguments."
                 )
-            res_usm_type = dpctl.utils.get_coerced_usm_type(
+            res_usm_type = dpt.get_coerced_usm_type(
                 (
                     x_usm_type,
                     min_usm_type,
                 )
             )
         elif q2 is None:
-            exec_q = dpctl.utils.get_execution_queue((q1, q3))
+            exec_q = dpt.get_execution_queue((q1, q3))
             if exec_q is None:
-                raise ExecutionPlacementError(
+                raise dpt.ExecutionPlacementError(
                     "Execution placement can not be unambiguously inferred "
                     "from input arguments."
                 )
-            res_usm_type = dpctl.utils.get_coerced_usm_type(
+            res_usm_type = dpt.get_coerced_usm_type(
                 (
                     x_usm_type,
                     max_usm_type,
                 )
             )
         else:
-            exec_q = dpctl.utils.get_execution_queue((q1, q2, q3))
+            exec_q = dpt.get_execution_queue((q1, q2, q3))
             if exec_q is None:
-                raise ExecutionPlacementError(
+                raise dpt.ExecutionPlacementError(
                     "Execution placement can not be unambiguously inferred "
                     "from input arguments."
                 )
-            res_usm_type = dpctl.utils.get_coerced_usm_type(
+            res_usm_type = dpt.get_coerced_usm_type(
                 (
                     x_usm_type,
                     min_usm_type,
                     max_usm_type,
                 )
             )
-        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+        dpt.validate_usm_type(res_usm_type, allow_none=False)
         x_shape = x.shape
         min_shape = _get_shape(min)
         max_shape = _get_shape(max)
@@ -506,11 +502,8 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
                     f"got {out.dtype}"
                 )
 
-            if (
-                dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
-                is None
-            ):
-                raise ExecutionPlacementError(
+            if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
+                raise dpt.ExecutionPlacementError(
                     "Input and output allocation queues are not compatible"
                 )
 
diff --git a/dpnp/tensor/_compute_follows_data.pyx b/dpnp/tensor/_compute_follows_data.pyx
new file mode 100644
index 000000000000..70e6bdfaeb79
--- /dev/null
+++ b/dpnp/tensor/_compute_follows_data.pyx
@@ -0,0 +1,191 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+# cython: linetrace=True
+
+"""Compute-follows-data utilities for execution queue and USM type management.
+
+This module provides utilities to determine execution placement and USM allocation
+types when combining arrays under the compute-follows-data paradigm.
+"""
+
+
+import dpctl
+from dpctl._sycl_queue cimport SyclQueue
+
+__all__ = [
+    "get_execution_queue", "get_coerced_usm_type", "ExecutionPlacementError"
+]
+
+
+class ExecutionPlacementError(Exception):
+    """Exception raised when execution placement target can not
+    be unambiguously determined from input arrays.
+
+    Make sure that input arrays are associated with the same
+    :class:`dpctl.SyclQueue`,
+    or migrate data to the same :class:`dpctl.SyclQueue` using
+    :meth:`dpctl.tensor.usm_ndarray.to_device` method.
+    """
+    pass
+
+
+cdef bint queue_equiv(SyclQueue q1, SyclQueue q2):
+    """Queues are equivalent if ``q1 == q2``, that is they are copies
+    of the same underlying SYCL object and hence are the same."""
+    return q1.__eq__(q2)
+
+
+def get_execution_queue(qs, /):
+    """
+    Get execution queue from queues associated with input arrays.
+
+    Args:
+        qs (List[:class:`dpctl.SyclQueue`], Tuple[:class:`dpctl.SyclQueue`]):
+            a list or a tuple of :class:`dpctl.SyclQueue` objects
+            corresponding to arrays that are being combined.
+
+    Returns:
+        SyclQueue:
+            execution queue under compute follows data paradigm,
+            or ``None`` if queues are not equal.
+    """
+    if not isinstance(qs, (list, tuple)):
+        raise TypeError(
+            "Expected a list or a tuple, got {}".format(type(qs))
+        )
+    if len(qs) == 0:
+        return None
+    elif len(qs) == 1:
+        return qs[0] if isinstance(qs[0], dpctl.SyclQueue) else None
+    for q1, q2 in zip(qs[:-1], qs[1:]):
+        if not isinstance(q1, dpctl.SyclQueue):
+            return None
+        elif not isinstance(q2, dpctl.SyclQueue):
+            return None
+        elif not queue_equiv(<SyclQueue> q1, <SyclQueue> q2):
+            return None
+    return qs[0]
+
+
+def get_coerced_usm_type(usm_types, /):
+    """
+    Get USM type of the output array for a function combining
+    arrays of given USM types using compute-follows-data execution
+    model.
+
+    Args:
+        usm_types (List[str], Tuple[str]):
+            a list or a tuple of strings of ``.usm_types`` attributes
+            for input arrays
+
+    Returns:
+         str
+            type of USM allocation for the output arrays (s).
+            ``None`` if any of the input strings are not recognized.
+    """
+    if not isinstance(usm_types, (list, tuple)):
+        raise TypeError(
+            "Expected a list or a tuple, got {}".format(type(usm_types))
+        )
+    if len(usm_types) == 0:
+        return None
+    _k = ["device", "shared", "host"]
+    _m = {k: i for i, k in enumerate(_k)}
+    res = len(_k)
+    for t in usm_types:
+        if not isinstance(t, str):
+            return None
+        if t not in _m:
+            return None
+        res = min(res, _m[t])
+    return _k[res]
+
+
+def _validate_usm_type_allow_none(usm_type):
+    "Validates usm_type argument"
+    if usm_type is not None:
+        if isinstance(usm_type, str):
+            if usm_type not in ["device", "shared", "host"]:
+                raise ValueError(
+                    f"Unrecognized value of usm_type={usm_type}, "
+                    "expected 'device', 'shared', 'host', or None."
+                )
+        else:
+            raise TypeError(
+                f"Expected usm_type to be a str or None, got {type(usm_type)}"
+            )
+
+
+def _validate_usm_type_disallow_none(usm_type):
+    "Validates usm_type argument"
+    if isinstance(usm_type, str):
+        if usm_type not in ["device", "shared", "host"]:
+            raise ValueError(
+                f"Unrecognized value of usm_type={usm_type}, "
+                "expected 'device', 'shared', or 'host'."
+            )
+    else:
+        raise TypeError(
+            f"Expected usm_type to be a str, got {type(usm_type)}"
+        )
+
+
+def validate_usm_type(usm_type, /, *, allow_none=True):
+    """ validate_usm_type(usm_type, allow_none=True)
+
+    Raises an exception if `usm_type` is invalid.
+
+    Args:
+        usm_type:
+            Specification for USM allocation type. Valid specifications
+            are:
+
+            * ``"device"``
+            * ``"shared"``
+            * ``"host"``
+
+            If ``allow_none`` keyword argument is set, a value of
+            ``None`` is also permitted.
+        allow_none (bool, optional):
+            Whether ``usm_type`` value of ``None`` is considered valid.
+            Default: `True`.
+
+    Raises:
+        ValueError:
+            if ``usm_type`` is not a recognized string.
+        TypeError:
+            if ``usm_type`` is not a string, and ``usm_type`` is
+            not ``None`` provided ``allow_none`` is ``True``.
+    """
+    if allow_none:
+        _validate_usm_type_allow_none(usm_type)
+    else:
+        _validate_usm_type_disallow_none(usm_type)
diff --git a/dpnp/tensor/_copy_utils.py b/dpnp/tensor/_copy_utils.py
index 9a16d4f59acd..ac1e9a9863a2 100644
--- a/dpnp/tensor/_copy_utils.py
+++ b/dpnp/tensor/_copy_utils.py
@@ -32,8 +32,8 @@
 
 import dpctl
 import dpctl.memory as dpm
-import dpctl.utils
 import numpy as np
+from dpctl.utils import SequentialOrderManager
 
 import dpnp.tensor as dpt
 import dpnp.tensor._tensor_impl as ti
@@ -124,7 +124,7 @@ def _copy_from_numpy_into(dst, np_ary):
             src_ary = src_ary.astype(np.float32)
         elif src_ary_dt_c == "D":
             src_ary = src_ary.astype(np.complex64)
-    _manager = dpctl.utils.SequentialOrderManager[copy_q]
+    _manager = SequentialOrderManager[copy_q]
     dep_ev = _manager.submitted_events
     # synchronizing call
     ti._copy_numpy_ndarray_into_usm_ndarray(
@@ -142,14 +142,12 @@ def _extract_impl(ary, ary_mask, axis=0):
             f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
         )
     if isinstance(ary_mask, dpt.usm_ndarray):
-        dst_usm_type = dpctl.utils.get_coerced_usm_type(
+        dst_usm_type = dpt.get_coerced_usm_type(
             (ary.usm_type, ary_mask.usm_type)
         )
-        exec_q = dpctl.utils.get_execution_queue(
-            (ary.sycl_queue, ary_mask.sycl_queue)
-        )
+        exec_q = dpt.get_execution_queue((ary.sycl_queue, ary_mask.sycl_queue))
         if exec_q is None:
-            raise dpctl.utils.ExecutionPlacementError(
+            raise dpt.ExecutionPlacementError(
                 "arrays have different associated queues. "
                 "Use `y.to_device(x.device)` to migrate."
             )
@@ -175,7 +173,7 @@ def _extract_impl(ary, ary_mask, axis=0):
     cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
     cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device)
     exec_q = cumsum.sycl_queue
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
     mask_count = ti.mask_positions(
         ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs
@@ -230,8 +228,8 @@ def _get_indices_queue_usm_type(inds, queue, usm_type):
         raise TypeError(
             "at least one element of `inds` expected to be an array"
         )
-    usm_type = dpctl.utils.get_coerced_usm_type(usm_types)
-    q = dpctl.utils.get_execution_queue(queues)
+    usm_type = dpt.get_coerced_usm_type(usm_types)
+    q = dpt.get_execution_queue(queues)
     return q, usm_type
 
 
@@ -247,7 +245,7 @@ def _nonzero_impl(ary):
     cumsum = dpt.empty(
         mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C"
     )
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
     mask_count = ti.mask_positions(
         ary, cumsum, sycl_queue=exec_q, depends=dep_evs
@@ -318,20 +316,20 @@ def _place_impl(ary, ary_mask, vals, axis=0):
             f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
         )
     if isinstance(ary_mask, dpt.usm_ndarray):
-        exec_q = dpctl.utils.get_execution_queue(
+        exec_q = dpt.get_execution_queue(
             (
                 ary.sycl_queue,
                 ary_mask.sycl_queue,
             )
         )
-        coerced_usm_type = dpctl.utils.get_coerced_usm_type(
+        coerced_usm_type = dpt.get_coerced_usm_type(
             (
                 ary.usm_type,
                 ary_mask.usm_type,
             )
         )
         if exec_q is None:
-            raise dpctl.utils.ExecutionPlacementError(
+            raise dpt.ExecutionPlacementError(
                 "arrays have different associated queues. "
                 "Use `y.to_device(x.device)` to migrate."
             )
@@ -355,15 +353,15 @@ def _place_impl(ary, ary_mask, vals, axis=0):
                 sycl_queue=exec_q,
             )
         else:
-            exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue))
-            coerced_usm_type = dpctl.utils.get_coerced_usm_type(
+            exec_q = dpt.get_execution_queue((exec_q, vals.sycl_queue))
+            coerced_usm_type = dpt.get_coerced_usm_type(
                 (
                     coerced_usm_type,
                     vals.usm_type,
                 )
             )
     if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError(
+        raise dpt.ExecutionPlacementError(
             "arrays have different associated queues. "
             "Use `Y.to_device(X.device)` to migrate."
         )
@@ -383,7 +381,7 @@ def _place_impl(ary, ary_mask, vals, axis=0):
         device=ary_mask.device,
     )
     exec_q = cumsum.sycl_queue
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     dep_ev = _manager.submitted_events
     mask_count = ti.mask_positions(
         ary_mask, cumsum, sycl_queue=exec_q, depends=dep_ev
@@ -440,15 +438,15 @@ def _put_multi_index(ary, inds, p, vals, mode=0):
                 sycl_queue=exec_q,
             )
         else:
-            exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue))
-            coerced_usm_type = dpctl.utils.get_coerced_usm_type(
+            exec_q = dpt.get_execution_queue((exec_q, vals.sycl_queue))
+            coerced_usm_type = dpt.get_coerced_usm_type(
                 (
                     coerced_usm_type,
                     vals.usm_type,
                 )
             )
     if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError(
+        raise dpt.ExecutionPlacementError(
             "Can not automatically determine where to allocate the "
             "result or performance execution. "
             "Use `usm_ndarray.to_device` method to migrate data to "
@@ -470,7 +468,7 @@ def _put_multi_index(ary, inds, p, vals, mode=0):
     else:
         rhs = dpt.astype(vals, ary.dtype)
     rhs = dpt.broadcast_to(rhs, expected_vals_shape)
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     dep_ev = _manager.submitted_events
     hev, put_ev = ti._put(
         dst=ary,
@@ -504,7 +502,7 @@ def _take_multi_index(ary, inds, p, mode=0):
         inds, ary.sycl_queue, ary.usm_type
     )
     if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError(
+        raise dpt.ExecutionPlacementError(
             "Can not automatically determine where to allocate the "
             "result or performance execution. "
             "Use `usm_ndarray.to_device` method to migrate data to "
@@ -522,7 +520,7 @@ def _take_multi_index(ary, inds, p, mode=0):
     res = dpt.empty(
         res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q
     )
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     dep_ev = _manager.submitted_events
     hev, take_ev = ti._take(
         src=ary,
@@ -630,7 +628,7 @@ def _copy_overlapping(dst, src):
         order="C",
         buffer_ctor_kwargs={"queue": q},
     )
-    _manager = dpctl.utils.SequentialOrderManager[q]
+    _manager = SequentialOrderManager[q]
     dep_evs = _manager.submitted_events
     hcp1, cp1 = ti._copy_usm_ndarray_into_usm_ndarray(
         src=src, dst=tmp, sycl_queue=q, depends=dep_evs
@@ -655,7 +653,7 @@ def _copy_same_shape(dst, src):
         return
 
     copy_q = dst.sycl_queue
-    _manager = dpctl.utils.SequentialOrderManager[copy_q]
+    _manager = SequentialOrderManager[copy_q]
     dep_evs = _manager.submitted_events
     hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
         src=src, dst=dst, sycl_queue=copy_q, depends=dep_evs
diff --git a/dpnp/tensor/_ctors.py b/dpnp/tensor/_ctors.py
index c6e14db7398f..7e9a6202f12a 100644
--- a/dpnp/tensor/_ctors.py
+++ b/dpnp/tensor/_ctors.py
@@ -31,8 +31,8 @@
 
 import dpctl
 import dpctl.memory as dpm
-import dpctl.utils
 import numpy as np
+from dpctl.utils import SequentialOrderManager
 
 import dpnp.tensor as dpt
 import dpnp.tensor._tensor_impl as ti
@@ -164,8 +164,8 @@ def _asarray_from_seq(
     if usm_type is None:
         usm_types_in_seq = []
         _usm_types_walker(seq_obj, usm_types_in_seq)
-        usm_type = dpctl.utils.get_coerced_usm_type(usm_types_in_seq)
-    dpctl.utils.validate_usm_type(usm_type)
+        usm_type = dpt.get_coerced_usm_type(usm_types_in_seq)
+    dpt.validate_usm_type(usm_type)
     if dtype is None:
         dtype = _map_to_device_dtype(seq_dt, alloc_q)
     else:
@@ -186,7 +186,7 @@ def _asarray_from_seq(
             sycl_queue=alloc_q,
             order=order,
         )
-        _manager = dpctl.utils.SequentialOrderManager[exec_q]
+        _manager = SequentialOrderManager[exec_q]
         _device_copy_walker(seq_obj, res, _manager)
         return res
     else:
@@ -215,7 +215,7 @@ def _asarray_from_seq_single_device(
         exec_q = seq_dev
         alloc_q = seq_dev
     else:
-        exec_q = dpctl.utils.get_execution_queue(
+        exec_q = dpt.get_execution_queue(
             (
                 sycl_queue,
                 seq_dev,
@@ -249,9 +249,7 @@ def _asarray_from_usm_ndarray(
     if usm_type is None:
         usm_type = usm_ndary.usm_type
     if sycl_queue is not None:
-        exec_q = dpctl.utils.get_execution_queue(
-            [usm_ndary.sycl_queue, sycl_queue]
-        )
+        exec_q = dpt.get_execution_queue([usm_ndary.sycl_queue, sycl_queue])
         copy_q = normalize_queue_device(sycl_queue=sycl_queue, device=exec_q)
     else:
         copy_q = usm_ndary.sycl_queue
@@ -300,9 +298,9 @@ def _asarray_from_usm_ndarray(
             order=order,
             buffer_ctor_kwargs={"queue": copy_q},
         )
-    eq = dpctl.utils.get_execution_queue([usm_ndary.sycl_queue, copy_q])
+    eq = dpt.get_execution_queue([usm_ndary.sycl_queue, copy_q])
     if eq is not None:
-        _manager = dpctl.utils.SequentialOrderManager[eq]
+        _manager = SequentialOrderManager[eq]
         dep_evs = _manager.submitted_events
         hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=usm_ndary, dst=res, sycl_queue=eq, depends=dep_evs
@@ -350,7 +348,7 @@ def _coerce_and_infer_dt(*args, dt, sycl_queue, err_msg, allow_bool=False):
 def _copy_through_host_walker(seq_o, usm_res):
     if isinstance(seq_o, dpt.usm_ndarray):
         if (
-            dpctl.utils.get_execution_queue(
+            dpt.get_execution_queue(
                 (
                     usm_res.sycl_queue,
                     seq_o.sycl_queue,
@@ -370,7 +368,7 @@ def _copy_through_host_walker(seq_o, usm_res):
     if hasattr(seq_o, "__sycl_usm_array_interface__"):
         usm_ar = _usm_ndarray_from_suai(seq_o)
         if (
-            dpctl.utils.get_execution_queue(
+            dpt.get_execution_queue(
                 (
                     usm_res.sycl_queue,
                     usm_ar.sycl_queue,
@@ -615,7 +613,7 @@ def arange(
         start = 0
     if step is None:
         step = 1
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
     is_bool = False
     if dtype:
@@ -656,7 +654,7 @@ def arange(
     else:
         _step = sc_ty(1)
     _start = _first
-    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    _manager = SequentialOrderManager[sycl_queue]
     # populating newly allocated array, no task dependencies
     hev, lin_ev = ti._linspace_step(_start, _step, res, sycl_queue)
     _manager.add_event_pair(hev, lin_ev)
@@ -752,7 +750,7 @@ def asarray(
         )
     order = order[0].upper()
     # 4. Check that usm_type is None, or a valid value
-    dpctl.utils.validate_usm_type(usm_type, allow_none=True)
+    dpt.validate_usm_type(usm_type, allow_none=True)
     # 5. Normalize device/sycl_queue [keep it None if was None]
     if device is not None or sycl_queue is not None:
         sycl_queue = normalize_queue_device(
@@ -846,7 +844,7 @@ def asarray(
                 if len(devs) == 1:
                     alloc_q = devs[0]
                 else:
-                    raise dpctl.utils.ExecutionPlacementError(
+                    raise dpt.ExecutionPlacementError(
                         "Please specify `device` or `sycl_queue` keyword "
                         "argument to determine where to allocate the "
                         "resulting array."
@@ -929,7 +927,7 @@ def empty(
             "Unrecognized order keyword value, expecting 'F' or 'C'."
         )
     order = order[0].upper()
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
     dtype = _get_dtype(dtype, sycl_queue)
     _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
@@ -997,7 +995,7 @@ def empty_like(
         dtype = x.dtype
     if usm_type is None:
         usm_type = x.usm_type
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     if device is None and sycl_queue is None:
         device = x.device
     sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
@@ -1097,7 +1095,7 @@ def eye(
             usm_type=usm_type,
             sycl_queue=sycl_queue,
         )
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
     dtype = _get_dtype(dtype, sycl_queue)
     _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
@@ -1109,7 +1107,7 @@ def eye(
         buffer_ctor_kwargs={"queue": sycl_queue},
     )
     if n_rows != 0 and n_cols != 0:
-        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+        _manager = SequentialOrderManager[sycl_queue]
         hev, eye_ev = ti._eye(k, dst=res, sycl_queue=sycl_queue)
         _manager.add_event_pair(hev, eye_ev)
     return res
@@ -1178,7 +1176,7 @@ def full(
             "Unrecognized order keyword value, expecting 'F' or 'C'."
         )
     order = order[0].upper()
-    dpctl.utils.validate_usm_type(usm_type, allow_none=True)
+    dpt.validate_usm_type(usm_type, allow_none=True)
 
     if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)):
         if (
@@ -1214,7 +1212,7 @@ def full(
     )
     fill_value = _cast_fill_val(fill_value, dtype)
 
-    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    _manager = SequentialOrderManager[sycl_queue]
     # populating new allocation, no dependent events
     hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue)
     _manager.add_event_pair(hev, full_ev)
@@ -1288,7 +1286,7 @@ def full_like(
         dtype = x.dtype
     if usm_type is None:
         usm_type = x.usm_type
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     if device is None and sycl_queue is None:
         device = x.device
     sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
@@ -1307,7 +1305,7 @@ def full_like(
             )
             X = dpt.broadcast_to(X, sh)
             res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
-            _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+            _manager = SequentialOrderManager[sycl_queue]
             # order copy after tasks populating X
             dep_evs = _manager.submitted_events
             hev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -1321,7 +1319,7 @@ def full_like(
         dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value))
         res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
         fill_value = _cast_fill_val(fill_value, dtype)
-        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+        _manager = SequentialOrderManager[sycl_queue]
         # populating new allocation, no dependent events
         hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue)
         _manager.add_event_pair(hev, full_ev)
@@ -1405,7 +1403,7 @@ def linspace(
             interval.
     """
     sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     if endpoint not in [True, False]:
         raise TypeError("endpoint keyword argument must be of boolean type")
 
@@ -1432,7 +1430,7 @@ def linspace(
         stop = float(stop)
 
     res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue)
-    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    _manager = SequentialOrderManager[sycl_queue]
     hev, la_ev = ti._linspace_affine(
         start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue
     )
@@ -1567,7 +1565,7 @@ def ones(
             "Unrecognized order keyword value, expecting 'F' or 'C'."
         )
     order = order[0].upper()
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
     dtype = _get_dtype(dtype, sycl_queue)
     res = dpt.usm_ndarray(
@@ -1577,7 +1575,7 @@ def ones(
         order=order,
         buffer_ctor_kwargs={"queue": sycl_queue},
     )
-    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    _manager = SequentialOrderManager[sycl_queue]
     # populating new allocation, no dependent events
     hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue)
     _manager.add_event_pair(hev, full_ev)
@@ -1639,7 +1637,7 @@ def ones_like(
         dtype = x.dtype
     if usm_type is None:
         usm_type = x.usm_type
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     if device is None and sycl_queue is None:
         device = x.device
     sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
@@ -1648,7 +1646,7 @@ def ones_like(
     if order == "K":
         _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
         res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
-        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+        _manager = SequentialOrderManager[sycl_queue]
         # populating new allocation, no dependent events
         hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue)
         _manager.add_event_pair(hev, full_ev)
@@ -1711,7 +1709,7 @@ def tril(x, /, *, k=0):
             usm_type=x.usm_type,
             sycl_queue=q,
         )
-        _manager = dpctl.utils.SequentialOrderManager[q]
+        _manager = SequentialOrderManager[q]
         dep_evs = _manager.submitted_events
         hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=x, dst=res, sycl_queue=q, depends=dep_evs
@@ -1733,7 +1731,7 @@ def tril(x, /, *, k=0):
             usm_type=x.usm_type,
             sycl_queue=q,
         )
-        _manager = dpctl.utils.SequentialOrderManager[q]
+        _manager = SequentialOrderManager[q]
         dep_evs = _manager.submitted_events
         hev, tril_ev = ti._tril(
             src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs
@@ -1797,7 +1795,7 @@ def triu(x, /, *, k=0):
             usm_type=x.usm_type,
             sycl_queue=q,
         )
-        _manager = dpctl.utils.SequentialOrderManager[q]
+        _manager = SequentialOrderManager[q]
         dep_evs = _manager.submitted_events
         hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
             src=x, dst=res, sycl_queue=q, depends=dep_evs
@@ -1811,7 +1809,7 @@ def triu(x, /, *, k=0):
             usm_type=x.usm_type,
             sycl_queue=q,
         )
-        _manager = dpctl.utils.SequentialOrderManager[q]
+        _manager = SequentialOrderManager[q]
         dep_evs = _manager.submitted_events
         hev, triu_ev = ti._triu(
             src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs
@@ -1871,7 +1869,7 @@ def zeros(
             "Unrecognized order keyword value, expecting 'F' or 'C'."
         )
     order = order[0].upper()
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
     dtype = _get_dtype(dtype, sycl_queue)
     _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
@@ -1882,7 +1880,7 @@ def zeros(
         order=order,
         buffer_ctor_kwargs={"queue": sycl_queue},
     )
-    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    _manager = SequentialOrderManager[sycl_queue]
     # populating new allocation, no dependent events
     hev, zeros_ev = ti._zeros_usm_ndarray(res, sycl_queue)
     _manager.add_event_pair(hev, zeros_ev)
@@ -1947,7 +1945,7 @@ def zeros_like(
         dtype = x.dtype
     if usm_type is None:
         usm_type = x.usm_type
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    dpt.validate_usm_type(usm_type, allow_none=False)
     if device is None and sycl_queue is None:
         device = x.device
     sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
@@ -1956,7 +1954,7 @@ def zeros_like(
     if order == "K":
         _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
         res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
-        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+        _manager = SequentialOrderManager[sycl_queue]
         # populating new allocation, no dependent events
         hev, full_ev = ti._full_usm_ndarray(0, res, sycl_queue)
         _manager.add_event_pair(hev, full_ev)
diff --git a/dpnp/tensor/_device.py b/dpnp/tensor/_device.py
index 8d763bc721e3..5f2725c74855 100644
--- a/dpnp/tensor/_device.py
+++ b/dpnp/tensor/_device.py
@@ -31,6 +31,8 @@
 from dpctl._sycl_device_factory import _cached_default_device
 from dpctl._sycl_queue_manager import get_device_cached_queue
 
+from ._compute_follows_data import get_execution_queue
+
 __doc__ = "Implementation of array API mandated Device class"
 
 
@@ -182,7 +184,7 @@ def normalize_queue_device(sycl_queue=None, device=None):
     if d is None:
         return q
     d = Device.create_device(d)
-    qq = dpctl.utils.get_execution_queue(
+    qq = get_execution_queue(
         (
             q,
             d.sycl_queue,
diff --git a/dpnp/tensor/_elementwise_common.py b/dpnp/tensor/_elementwise_common.py
index d312d50a4a8f..e258df1b2e93 100644
--- a/dpnp/tensor/_elementwise_common.py
+++ b/dpnp/tensor/_elementwise_common.py
@@ -26,8 +26,7 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
 
-import dpctl
-from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
+from dpctl.utils import SequentialOrderManager
 
 import dpnp.tensor as dpt
 import dpnp.tensor._tensor_impl as ti
@@ -232,11 +231,8 @@ def __call__(self, x, /, *, out=None, order="K"):
                 # created, so the array overlap check isn't needed.
                 out = dpt.empty_like(out)
 
-            if (
-                dpctl.utils.get_execution_queue((x.sycl_queue, out.sycl_queue))
-                is None
-            ):
-                raise ExecutionPlacementError(
+            if dpt.get_execution_queue((x.sycl_queue, out.sycl_queue)) is None:
+                raise dpt.ExecutionPlacementError(
                     "Input and output allocation queues are not compatible"
                 )
 
@@ -482,7 +478,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
         q1, o1_usm_type = _get_queue_usm_type(o1)
         q2, o2_usm_type = _get_queue_usm_type(o2)
         if q1 is None and q2 is None:
-            raise ExecutionPlacementError(
+            raise dpt.ExecutionPlacementError(
                 "Execution placement can not be unambiguously inferred "
                 "from input arguments. "
                 "One of the arguments must represent USM allocation and "
@@ -495,19 +491,19 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
             exec_q = q1
             res_usm_type = o1_usm_type
         else:
-            exec_q = dpctl.utils.get_execution_queue((q1, q2))
+            exec_q = dpt.get_execution_queue((q1, q2))
             if exec_q is None:
-                raise ExecutionPlacementError(
+                raise dpt.ExecutionPlacementError(
                     "Execution placement can not be unambiguously inferred "
                     "from input arguments."
                 )
-            res_usm_type = dpctl.utils.get_coerced_usm_type(
+            res_usm_type = dpt.get_coerced_usm_type(
                 (
                     o1_usm_type,
                     o2_usm_type,
                 )
             )
-        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+        dpt.validate_usm_type(res_usm_type, allow_none=False)
         o1_shape = _get_shape(o1)
         o2_shape = _get_shape(o2)
         if not all(
@@ -583,11 +579,8 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
                     f"got {out.dtype}"
                 )
 
-            if (
-                dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
-                is None
-            ):
-                raise ExecutionPlacementError(
+            if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
+                raise dpt.ExecutionPlacementError(
                     "Input and output allocation queues are not compatible"
                 )
 
@@ -876,19 +869,19 @@ def _inplace_op(self, o1, o2):
             exec_q = q1
             res_usm_type = o1_usm_type
         else:
-            exec_q = dpctl.utils.get_execution_queue((q1, q2))
+            exec_q = dpt.get_execution_queue((q1, q2))
             if exec_q is None:
-                raise ExecutionPlacementError(
+                raise dpt.ExecutionPlacementError(
                     "Execution placement can not be unambiguously inferred "
                     "from input arguments."
                 )
-            res_usm_type = dpctl.utils.get_coerced_usm_type(
+            res_usm_type = dpt.get_coerced_usm_type(
                 (
                     o1_usm_type,
                     o2_usm_type,
                 )
             )
-        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+        dpt.validate_usm_type(res_usm_type, allow_none=False)
         o1_shape = o1.shape
         o2_shape = _get_shape(o2)
         if not isinstance(o2_shape, (tuple, list)):
diff --git a/dpnp/tensor/_indexing_functions.py b/dpnp/tensor/_indexing_functions.py
index 8f097e59efc3..32162942d738 100644
--- a/dpnp/tensor/_indexing_functions.py
+++ b/dpnp/tensor/_indexing_functions.py
@@ -28,8 +28,7 @@
 
 import operator
 
-import dpctl
-import dpctl.utils
+from dpctl.utils import SequentialOrderManager
 
 import dpnp.tensor as dpt
 import dpnp.tensor._tensor_impl as ti
@@ -91,14 +90,14 @@ def extract(condition, arr):
         raise TypeError(
             "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
         )
-    exec_q = dpctl.utils.get_execution_queue(
+    exec_q = dpt.get_execution_queue(
         (
             condition.sycl_queue,
             arr.sycl_queue,
         )
     )
     if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError
+        raise dpt.ExecutionPlacementError
     if condition.shape != arr.shape:
         raise ValueError("Arrays are not of the same size")
     return _extract_impl(arr, condition)
@@ -163,7 +162,7 @@ def place(arr, mask, vals):
         raise TypeError(
             "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(vals)}"
         )
-    exec_q = dpctl.utils.get_execution_queue(
+    exec_q = dpt.get_execution_queue(
         (
             arr.sycl_queue,
             mask.sycl_queue,
@@ -171,11 +170,11 @@ def place(arr, mask, vals):
         )
     )
     if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError
+        raise dpt.ExecutionPlacementError
     if arr.shape != mask.shape or vals.ndim != 1:
         raise ValueError("Array sizes are not as required")
     cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q)
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     deps_ev = _manager.submitted_events
     nz_count = ti.mask_positions(
         mask, cumsum, sycl_queue=exec_q, depends=deps_ev
@@ -297,10 +296,10 @@ def put_vec_duplicates(vec, ind, vals):
                 indices.dtype
             )
         )
-    exec_q = dpctl.utils.get_execution_queue(queues_)
+    exec_q = dpt.get_execution_queue(queues_)
     if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError
-    vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_)
+        raise dpt.ExecutionPlacementError
+    vals_usm_type = dpt.get_coerced_usm_type(usm_types_)
 
     mode = _get_indexing_mode(mode)
 
@@ -340,7 +339,7 @@ def put_vec_duplicates(vec, ind, vals):
         rhs = dpt.astype(vals, x.dtype)
     rhs = dpt.broadcast_to(rhs, val_shape)
 
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     deps_ev = _manager.submitted_events
     hev, put_ev = ti._put(
         x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev
@@ -404,13 +403,13 @@ def put_along_axis(x, indices, vals, /, *, axis=-1, mode="wrap"):
     else:
         queues_ = [x.sycl_queue, indices.sycl_queue]
         usm_types_ = [x.usm_type, indices.usm_type]
-    exec_q = dpctl.utils.get_execution_queue(queues_)
+    exec_q = dpt.get_execution_queue(queues_)
     if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError(
+        raise dpt.ExecutionPlacementError(
             "Execution placement can not be unambiguously inferred "
             "from input arguments. "
         )
-    out_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_)
+    out_usm_type = dpt.get_coerced_usm_type(usm_types_)
     mode_i = _get_indexing_mode(mode)
     indexes_dt = (
         dpt.uint64
@@ -482,12 +481,10 @@ def take(x, indices, /, *, axis=None, out=None, mode="wrap"):
         raise ValueError(
             "`indices` expected a 1D array, got `{}`".format(indices.ndim)
         )
-    exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, indices.sycl_queue])
+    exec_q = dpt.get_execution_queue([x.sycl_queue, indices.sycl_queue])
     if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError
-    res_usm_type = dpctl.utils.get_coerced_usm_type(
-        [x.usm_type, indices.usm_type]
-    )
+        raise dpt.ExecutionPlacementError
+    res_usm_type = dpt.get_coerced_usm_type([x.usm_type, indices.usm_type])
 
     mode = _get_indexing_mode(mode)
 
@@ -532,8 +529,8 @@ def take(x, indices, /, *, axis=None, out=None, mode="wrap"):
             raise ValueError(
                 f"Output array of type {dt} is needed, got {out.dtype}"
             )
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise dpctl.utils.ExecutionPlacementError(
+        if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise dpt.ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
         if ti._array_overlap(x, out):
@@ -543,7 +540,7 @@ def take(x, indices, /, *, axis=None, out=None, mode="wrap"):
             res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q
         )
 
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     deps_ev = _manager.submitted_events
     hev, take_ev = ti._take(
         x, (indices,), out, axis, mode, sycl_queue=exec_q, depends=deps_ev
@@ -612,12 +609,10 @@ def take_along_axis(x, indices, /, *, axis=-1, mode="wrap"):
             "argument arrays must be equal"
         )
     pp = normalize_axis_index(operator.index(axis), x_nd)
-    out_usm_type = dpctl.utils.get_coerced_usm_type(
-        (x.usm_type, indices.usm_type)
-    )
-    exec_q = dpctl.utils.get_execution_queue((x.sycl_queue, indices.sycl_queue))
+    out_usm_type = dpt.get_coerced_usm_type((x.usm_type, indices.usm_type))
+    exec_q = dpt.get_execution_queue((x.sycl_queue, indices.sycl_queue))
     if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError(
+        raise dpt.ExecutionPlacementError(
             "Execution placement can not be unambiguously inferred "
             "from input arguments. "
         )
diff --git a/dpnp/tensor/_linear_algebra_functions.py b/dpnp/tensor/_linear_algebra_functions.py
index bec0522cd18f..ad64fd201eb0 100644
--- a/dpnp/tensor/_linear_algebra_functions.py
+++ b/dpnp/tensor/_linear_algebra_functions.py
@@ -28,8 +28,7 @@
 
 import operator
 
-import dpctl
-from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
+from dpctl.utils import SequentialOrderManager
 
 import dpnp.tensor as dpt
 import dpnp.tensor._tensor_elementwise_impl as tei
@@ -121,19 +120,19 @@ def tensordot(x1, x2, axes=2):
         raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
     q1, x1_usm_type = x1.sycl_queue, x1.usm_type
     q2, x2_usm_type = x2.sycl_queue, x2.usm_type
-    exec_q = dpctl.utils.get_execution_queue((q1, q2))
+    exec_q = dpt.get_execution_queue((q1, q2))
     if exec_q is None:
-        raise ExecutionPlacementError(
+        raise dpt.ExecutionPlacementError(
             "Execution placement can not be unambiguously inferred "
             "from input arguments."
         )
-    res_usm_type = dpctl.utils.get_coerced_usm_type(
+    res_usm_type = dpt.get_coerced_usm_type(
         (
             x1_usm_type,
             x2_usm_type,
         )
     )
-    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+    dpt.validate_usm_type(res_usm_type, allow_none=False)
     # handle axes and shapes validation
     x1_nd = x1.ndim
     x2_nd = x2.ndim
@@ -357,19 +356,19 @@ def vecdot(x1, x2, axis=-1):
         raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
     q1, x1_usm_type = x1.sycl_queue, x1.usm_type
     q2, x2_usm_type = x2.sycl_queue, x2.usm_type
-    exec_q = dpctl.utils.get_execution_queue((q1, q2))
+    exec_q = dpt.get_execution_queue((q1, q2))
     if exec_q is None:
-        raise ExecutionPlacementError(
+        raise dpt.ExecutionPlacementError(
             "Execution placement can not be unambiguously inferred "
             "from input arguments."
         )
-    res_usm_type = dpctl.utils.get_coerced_usm_type(
+    res_usm_type = dpt.get_coerced_usm_type(
         (
             x1_usm_type,
             x2_usm_type,
         )
     )
-    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+    dpt.validate_usm_type(res_usm_type, allow_none=False)
     # axis and shape validation
     x1_nd = x1.ndim
     x2_nd = x2.ndim
@@ -661,19 +660,19 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
         order = "K"
     q1, x1_usm_type = x1.sycl_queue, x1.usm_type
     q2, x2_usm_type = x2.sycl_queue, x2.usm_type
-    exec_q = dpctl.utils.get_execution_queue((q1, q2))
+    exec_q = dpt.get_execution_queue((q1, q2))
     if exec_q is None:
-        raise ExecutionPlacementError(
+        raise dpt.ExecutionPlacementError(
             "Execution placement can not be unambiguously inferred "
             "from input arguments."
         )
-    res_usm_type = dpctl.utils.get_coerced_usm_type(
+    res_usm_type = dpt.get_coerced_usm_type(
         (
             x1_usm_type,
             x2_usm_type,
         )
     )
-    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+    dpt.validate_usm_type(res_usm_type, allow_none=False)
 
     x1_nd = x1.ndim
     x2_nd = x2.ndim
@@ -780,8 +779,8 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
                 f"Output array of type {res_dt} is needed, got {out.dtype}"
             )
 
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
+        if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise dpt.ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
 
diff --git a/dpnp/tensor/_manipulation_functions.py b/dpnp/tensor/_manipulation_functions.py
index 965bafda7948..7347f62de115 100644
--- a/dpnp/tensor/_manipulation_functions.py
+++ b/dpnp/tensor/_manipulation_functions.py
@@ -29,9 +29,8 @@
 import itertools
 import operator
 
-import dpctl
-import dpctl.utils as dputils
 import numpy as np
+from dpctl.utils import SequentialOrderManager
 
 import dpnp.tensor as dpt
 import dpnp.tensor._tensor_impl as ti
@@ -57,11 +56,11 @@ def _arrays_validation(arrays, check_ndim=True):
         if not isinstance(X, dpt.usm_ndarray):
             raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
 
-    exec_q = dputils.get_execution_queue([X.sycl_queue for X in arrays])
+    exec_q = dpt.get_execution_queue([X.sycl_queue for X in arrays])
     if exec_q is None:
         raise ValueError("All the input arrays must have same sycl queue.")
 
-    res_usm_type = dputils.get_coerced_usm_type([X.usm_type for X in arrays])
+    res_usm_type = dpt.get_coerced_usm_type([X.usm_type for X in arrays])
     if res_usm_type is None:
         raise ValueError("All the input arrays must have usm_type.")
 
@@ -176,7 +175,7 @@ def _concat_axis_None(arrays):
     )
 
     fill_start = 0
-    _manager = dputils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     deps = _manager.submitted_events
     for array in arrays:
         fill_end = fill_start + array.size
@@ -335,7 +334,7 @@ def concat(arrays, /, *, axis=0):
         res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
     )
 
-    _manager = dputils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     deps = _manager.submitted_events
     fill_start = 0
     for i in range(n):
@@ -584,21 +583,19 @@ def repeat(x, repeats, /, *, axis=None):
                 "`repeats` array must be 0- or 1-dimensional, got "
                 f"{repeats.ndim}"
             )
-        exec_q = dpctl.utils.get_execution_queue(
-            (x.sycl_queue, repeats.sycl_queue)
-        )
+        exec_q = dpt.get_execution_queue((x.sycl_queue, repeats.sycl_queue))
         if exec_q is None:
-            raise dputils.ExecutionPlacementError(
+            raise dpt.ExecutionPlacementError(
                 "Execution placement can not be unambiguously inferred "
                 "from input arguments."
             )
-        usm_type = dpctl.utils.get_coerced_usm_type(
+        usm_type = dpt.get_coerced_usm_type(
             (
                 x.usm_type,
                 repeats.usm_type,
             )
         )
-        dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+        dpt.validate_usm_type(usm_type, allow_none=False)
         if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"):
             raise TypeError(
                 f"'repeats' data type {repeats.dtype} cannot be cast to "
@@ -651,7 +648,7 @@ def repeat(x, repeats, /, *, axis=None):
             f"got {type(repeats)}"
         )
 
-    _manager = dputils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
     if scalar:
         res_axis_size = repeats * axis_size
@@ -786,7 +783,7 @@ def roll(x, /, shift, *, axis=None):
     if not isinstance(x, dpt.usm_ndarray):
         raise TypeError(f"Expected usm_ndarray type, got {type(x)}.")
     exec_q = x.sycl_queue
-    _manager = dputils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     if axis is None:
         shift = operator.index(shift)
         res = dpt.empty(
@@ -918,7 +915,7 @@ def stack(arrays, /, *, axis=0):
         res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
     )
 
-    _manager = dputils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
     for i in range(n):
         c_shapes_copy = tuple(
@@ -1088,7 +1085,7 @@ def tile(x, repetitions, /):
             broadcast_sh,
         )
         # copy broadcast input into flat array
-        _manager = dputils.SequentialOrderManager[exec_q]
+        _manager = SequentialOrderManager[exec_q]
         dep_evs = _manager.submitted_events
         hev, cp_ev = ti._copy_usm_ndarray_for_reshape(
             src=x, dst=res, sycl_queue=exec_q, depends=dep_evs
diff --git a/dpnp/tensor/_print.py b/dpnp/tensor/_print.py
index c9325af9d312..51de51265907 100644
--- a/dpnp/tensor/_print.py
+++ b/dpnp/tensor/_print.py
@@ -31,8 +31,8 @@
 import operator
 
 import dpctl
-import dpctl.utils
 import numpy as np
+from dpctl.utils import SequentialOrderManager
 
 import dpnp.tensor as dpt
 import dpnp.tensor._tensor_impl as ti
@@ -286,7 +286,7 @@ def _nd_corners(arr_in, edge_items):
         else:
             blocks.append((np.s_[:],))
 
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
+    _manager = SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
     hev_list = []
     for slc in itertools.product(*blocks):
diff --git a/dpnp/tensor/_reduction.py b/dpnp/tensor/_reduction.py
index 82b75503e269..dfa77c63fe92 100644
--- a/dpnp/tensor/_reduction.py
+++ b/dpnp/tensor/_reduction.py
@@ -26,8 +26,7 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
 
-import dpctl
-from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
+from dpctl.utils import SequentialOrderManager
 
 import dpnp.tensor as dpt
 import dpnp.tensor._tensor_impl as ti
@@ -88,8 +87,8 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn):
             raise ValueError(
                 f"Output array of type {res_dt} is needed, got {out.dtype}"
             )
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
+        if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise dpt.ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
         if keepdims:
@@ -204,8 +203,8 @@ def _reduction_over_axis(
             raise ValueError(
                 f"Output array of type {res_dt} is needed, got {out.dtype}"
             )
-        if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
+        if dpt.get_execution_queue((q, out.sycl_queue)) is None:
+            raise dpt.ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
         if keepdims:
@@ -351,8 +350,8 @@ def _search_over_axis(x, axis, keepdims, out, _reduction_fn):
             raise ValueError(
                 f"Output array of type {res_dt} is needed, got {out.dtype}"
             )
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
+        if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise dpt.ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
         if keepdims:
diff --git a/dpnp/tensor/_reshape.py b/dpnp/tensor/_reshape.py
index 6d817c5ccdf0..0187ae496003 100644
--- a/dpnp/tensor/_reshape.py
+++ b/dpnp/tensor/_reshape.py
@@ -28,8 +28,8 @@
 
 import operator
 
-import dpctl.utils
 import numpy as np
+from dpctl.utils import SequentialOrderManager
 
 import dpnp.tensor as dpt
 
@@ -179,7 +179,7 @@ def reshape(X, /, shape, *, order="C", copy=None):
             buffer=X.usm_type,
             buffer_ctor_kwargs={"queue": copy_q},
         )
-        _manager = dpctl.utils.SequentialOrderManager[copy_q]
+        _manager = SequentialOrderManager[copy_q]
         dep_evs = _manager.submitted_events
         if order == "C":
             hev, r_e = _copy_usm_ndarray_for_reshape(
diff --git a/dpnp/tensor/_search_functions.py b/dpnp/tensor/_search_functions.py
index 7e443351311a..339f2b2a4e3d 100644
--- a/dpnp/tensor/_search_functions.py
+++ b/dpnp/tensor/_search_functions.py
@@ -26,8 +26,7 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
 
-import dpctl
-from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
+from dpctl.utils import SequentialOrderManager
 
 import dpnp.tensor as dpt
 import dpnp.tensor._tensor_impl as ti
@@ -174,46 +173,46 @@ def where(condition, x1, x2, /, *, order="K", out=None):
         exec_q = q1
         out_usm_type = condition_usm_type
     elif q3 is None:
-        exec_q = dpctl.utils.get_execution_queue((q1, q2))
+        exec_q = dpt.get_execution_queue((q1, q2))
         if exec_q is None:
-            raise ExecutionPlacementError(
+            raise dpt.ExecutionPlacementError(
                 "Execution placement can not be unambiguously inferred "
                 "from input arguments."
             )
-        out_usm_type = dpctl.utils.get_coerced_usm_type(
+        out_usm_type = dpt.get_coerced_usm_type(
             (
                 condition_usm_type,
                 x1_usm_type,
             )
         )
     elif q2 is None:
-        exec_q = dpctl.utils.get_execution_queue((q1, q3))
+        exec_q = dpt.get_execution_queue((q1, q3))
         if exec_q is None:
-            raise ExecutionPlacementError(
+            raise dpt.ExecutionPlacementError(
                 "Execution placement can not be unambiguously inferred "
                 "from input arguments."
             )
-        out_usm_type = dpctl.utils.get_coerced_usm_type(
+        out_usm_type = dpt.get_coerced_usm_type(
             (
                 condition_usm_type,
                 x2_usm_type,
             )
         )
     else:
-        exec_q = dpctl.utils.get_execution_queue((q1, q2, q3))
+        exec_q = dpt.get_execution_queue((q1, q2, q3))
         if exec_q is None:
-            raise ExecutionPlacementError(
+            raise dpt.ExecutionPlacementError(
                 "Execution placement can not be unambiguously inferred "
                 "from input arguments."
             )
-        out_usm_type = dpctl.utils.get_coerced_usm_type(
+        out_usm_type = dpt.get_coerced_usm_type(
             (
                 condition_usm_type,
                 x1_usm_type,
                 x2_usm_type,
             )
         )
-    dpctl.utils.validate_usm_type(out_usm_type, allow_none=False)
+    dpt.validate_usm_type(out_usm_type, allow_none=False)
     condition_shape = condition.shape
     x1_shape = _get_shape(x1)
     x2_shape = _get_shape(x2)
@@ -280,8 +279,8 @@ def where(condition, x1, x2, /, *, order="K", out=None):
                 f"got {out.dtype}"
             )
 
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
+        if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
+            raise dpt.ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
 
diff --git a/dpnp/tensor/_searchsorted.py b/dpnp/tensor/_searchsorted.py
index 66a2df7ff375..0702e1711ef9 100644
--- a/dpnp/tensor/_searchsorted.py
+++ b/dpnp/tensor/_searchsorted.py
@@ -32,6 +32,11 @@
 import dpctl
 import dpctl.utils as du
 
+from ._compute_follows_data import (
+    ExecutionPlacementError,
+    get_coerced_usm_type,
+    get_execution_queue,
+)
 from ._copy_utils import _empty_like_orderK
 from ._ctors import empty
 from ._tensor_impl import _copy_usm_ndarray_into_usm_ndarray as ti_copy
@@ -99,13 +104,13 @@ def searchsorted(
         )
 
     if sorter is None:
-        q = du.get_execution_queue([x1.sycl_queue, x2.sycl_queue])
+        q = get_execution_queue([x1.sycl_queue, x2.sycl_queue])
     else:
-        q = du.get_execution_queue(
+        q = get_execution_queue(
             [x1.sycl_queue, x2.sycl_queue, sorter.sycl_queue]
         )
     if q is None:
-        raise du.ExecutionPlacementError(
+        raise ExecutionPlacementError(
             "Execution placement can not be unambiguously "
             "inferred from input arguments."
         )
@@ -164,7 +169,7 @@ def searchsorted(
             _manager.add_event_pair(ht_ev, ev)
             x2 = x2_buf
 
-    dst_usm_type = du.get_coerced_usm_type([x1.usm_type, x2.usm_type])
+    dst_usm_type = get_coerced_usm_type([x1.usm_type, x2.usm_type])
     index_dt = ti_default_device_index_type(q)
 
     dst = _empty_like_orderK(x2, index_dt, usm_type=dst_usm_type)
diff --git a/dpnp/tensor/_set_functions.py b/dpnp/tensor/_set_functions.py
index e6131ddf7d2a..3b1a9b66d0da 100644
--- a/dpnp/tensor/_set_functions.py
+++ b/dpnp/tensor/_set_functions.py
@@ -683,7 +683,7 @@ def isin(
     q1, x_usm_type = _get_queue_usm_type(x)
     q2, test_usm_type = _get_queue_usm_type(test_elements)
     if q1 is None and q2 is None:
-        raise du.ExecutionPlacementError(
+        raise dpt.ExecutionPlacementError(
             "Execution placement can not be unambiguously inferred "
             "from input arguments. "
             "One of the arguments must represent USM allocation and "
@@ -696,19 +696,19 @@ def isin(
         exec_q = q1
         res_usm_type = x_usm_type
     else:
-        exec_q = du.get_execution_queue((q1, q2))
+        exec_q = dpt.get_execution_queue((q1, q2))
         if exec_q is None:
-            raise du.ExecutionPlacementError(
+            raise dpt.ExecutionPlacementError(
                 "Execution placement can not be unambiguously inferred "
                 "from input arguments."
             )
-        res_usm_type = du.get_coerced_usm_type(
+        res_usm_type = dpt.get_coerced_usm_type(
             (
                 x_usm_type,
                 test_usm_type,
             )
         )
-    du.validate_usm_type(res_usm_type, allow_none=False)
+    dpt.validate_usm_type(res_usm_type, allow_none=False)
     sycl_dev = exec_q.sycl_device
 
     if not isinstance(invert, bool):
diff --git a/dpnp/tensor/_testing.py b/dpnp/tensor/_testing.py
index ec1f0c47be60..33b1b30980a3 100644
--- a/dpnp/tensor/_testing.py
+++ b/dpnp/tensor/_testing.py
@@ -26,7 +26,6 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
 
-import dpctl.utils as du
 import numpy as np
 
 import dpnp.tensor as dpt
@@ -135,9 +134,9 @@ def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
             "Absolute and relative tolerances must be non-negative"
         )
     equal_nan = bool(equal_nan)
-    exec_q = du.get_execution_queue(tuple(a.sycl_queue for a in (a1, a2)))
+    exec_q = dpt.get_execution_queue(tuple(a.sycl_queue for a in (a1, a2)))
     if exec_q is None:
-        raise du.ExecutionPlacementError(
+        raise dpt.ExecutionPlacementError(
             "Execution placement can not be unambiguously inferred "
             "from input arguments."
         )
diff --git a/dpnp/tensor/_utility_functions.py b/dpnp/tensor/_utility_functions.py
index 644c2ce9911f..a02f7406d135 100644
--- a/dpnp/tensor/_utility_functions.py
+++ b/dpnp/tensor/_utility_functions.py
@@ -206,46 +206,46 @@ def _concat_diff_input(arr, axis, prepend, append):
             exec_q = q1
             coerced_usm_type = x_usm_type
         elif q3 is None:
-            exec_q = du.get_execution_queue((q1, q2))
+            exec_q = dpt.get_execution_queue((q1, q2))
             if exec_q is None:
-                raise du.ExecutionPlacementError(
+                raise dpt.ExecutionPlacementError(
                     "Execution placement can not be unambiguously inferred "
                     "from input arguments."
                 )
-            coerced_usm_type = du.get_coerced_usm_type(
+            coerced_usm_type = dpt.get_coerced_usm_type(
                 (
                     x_usm_type,
                     prepend_usm_type,
                 )
             )
         elif q2 is None:
-            exec_q = du.get_execution_queue((q1, q3))
+            exec_q = dpt.get_execution_queue((q1, q3))
             if exec_q is None:
-                raise du.ExecutionPlacementError(
+                raise dpt.ExecutionPlacementError(
                     "Execution placement can not be unambiguously inferred "
                     "from input arguments."
                 )
-            coerced_usm_type = du.get_coerced_usm_type(
+            coerced_usm_type = dpt.get_coerced_usm_type(
                 (
                     x_usm_type,
                     append_usm_type,
                 )
             )
         else:
-            exec_q = du.get_execution_queue((q1, q2, q3))
+            exec_q = dpt.get_execution_queue((q1, q2, q3))
             if exec_q is None:
-                raise du.ExecutionPlacementError(
+                raise dpt.ExecutionPlacementError(
                     "Execution placement can not be unambiguously inferred "
                     "from input arguments."
                 )
-            coerced_usm_type = du.get_coerced_usm_type(
+            coerced_usm_type = dpt.get_coerced_usm_type(
                 (
                     x_usm_type,
                     prepend_usm_type,
                     append_usm_type,
                 )
             )
-        du.validate_usm_type(coerced_usm_type, allow_none=False)
+        dpt.validate_usm_type(coerced_usm_type, allow_none=False)
         arr_shape = arr.shape
         prepend_shape = _get_shape(prepend)
         append_shape = _get_shape(append)
@@ -318,19 +318,19 @@ def _concat_diff_input(arr, axis, prepend, append):
             exec_q = q1
             coerced_usm_type = x_usm_type
         else:
-            exec_q = du.get_execution_queue((q1, q2))
+            exec_q = dpt.get_execution_queue((q1, q2))
             if exec_q is None:
-                raise du.ExecutionPlacementError(
+                raise dpt.ExecutionPlacementError(
                     "Execution placement can not be unambiguously inferred "
                     "from input arguments."
                 )
-            coerced_usm_type = du.get_coerced_usm_type(
+            coerced_usm_type = dpt.get_coerced_usm_type(
                 (
                     x_usm_type,
                     prepend_usm_type,
                 )
             )
-        du.validate_usm_type(coerced_usm_type, allow_none=False)
+        dpt.validate_usm_type(coerced_usm_type, allow_none=False)
         arr_shape = arr.shape
         prepend_shape = _get_shape(prepend)
         if not isinstance(prepend_shape, (tuple, list)):
@@ -375,19 +375,19 @@ def _concat_diff_input(arr, axis, prepend, append):
             exec_q = q1
             coerced_usm_type = x_usm_type
         else:
-            exec_q = du.get_execution_queue((q1, q2))
+            exec_q = dpt.get_execution_queue((q1, q2))
             if exec_q is None:
-                raise du.ExecutionPlacementError(
+                raise dpt.ExecutionPlacementError(
                     "Execution placement can not be unambiguously inferred "
                     "from input arguments."
                 )
-            coerced_usm_type = du.get_coerced_usm_type(
+            coerced_usm_type = dpt.get_coerced_usm_type(
                 (
                     x_usm_type,
                     append_usm_type,
                 )
             )
-        du.validate_usm_type(coerced_usm_type, allow_none=False)
+        dpt.validate_usm_type(coerced_usm_type, allow_none=False)
         arr_shape = arr.shape
         append_shape = _get_shape(append)
         if not isinstance(append_shape, (tuple, list)):
diff --git a/dpnp/tests/tensor/elementwise/test_add.py b/dpnp/tests/tensor/elementwise/test_add.py
index 0320ec642a66..28a4efb21e94 100644
--- a/dpnp/tests/tensor/elementwise/test_add.py
+++ b/dpnp/tests/tensor/elementwise/test_add.py
@@ -32,7 +32,6 @@
 import dpctl
 import numpy as np
 import pytest
-from dpctl.utils import ExecutionPlacementError
 
 import dpnp.tensor as dpt
 from dpnp.tensor._type_utils import _can_cast
@@ -101,9 +100,7 @@ def test_add_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.add(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
@@ -299,7 +296,7 @@ def test_add_errors():
     ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue)
     ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue)
     y = dpt.empty_like(ar1, sycl_queue=cpu_queue)
-    with pytest.raises(ExecutionPlacementError) as excinfo:
+    with pytest.raises(dpt.ExecutionPlacementError) as excinfo:
         dpt.add(ar1, ar2, out=y)
     assert "Input and output allocation queues are not compatible" in str(
         excinfo.value
@@ -316,7 +313,7 @@ def test_add_errors():
 
     ar1 = np.ones(2, dtype="float32")
     ar2 = np.ones_like(ar1, dtype="int32")
-    with pytest.raises(ExecutionPlacementError) as excinfo:
+    with pytest.raises(dpt.ExecutionPlacementError) as excinfo:
         dpt.add(ar1, ar2)
     assert re.match(
         "Execution placement can not be unambiguously inferred.*",
@@ -484,7 +481,7 @@ def test_add_inplace_errors():
 
     ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue)
     ar2 = dpt.ones_like(ar1, sycl_queue=cpu_queue)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.add(ar1, ar2, out=ar1)
 
     ar1 = dpt.ones(2, dtype="float32")
@@ -522,7 +519,7 @@ def test_add_inplace_operator_errors():
 
     x_q1 = dpt.ones(10, dtype="i4", sycl_queue=q1)
     x_q2 = dpt.ones(10, dtype="i4", sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.add._inplace_op(x_q1, x_q2)
 
 
@@ -564,10 +561,10 @@ def test_add_cfd():
 
     x1 = dpt.ones(10, sycl_queue=q1)
     x2 = dpt.ones(10, sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.add(x1, x2)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.add(x1, x1, out=x2)
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_divide.py b/dpnp/tests/tensor/elementwise/test_divide.py
index e39436394f7d..99de5a51214d 100644
--- a/dpnp/tests/tensor/elementwise/test_divide.py
+++ b/dpnp/tests/tensor/elementwise/test_divide.py
@@ -95,9 +95,7 @@ def test_divide_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.divide(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_equal.py b/dpnp/tests/tensor/elementwise/test_equal.py
index 2791d600f7a3..f5e0cd520762 100644
--- a/dpnp/tests/tensor/elementwise/test_equal.py
+++ b/dpnp/tests/tensor/elementwise/test_equal.py
@@ -90,9 +90,7 @@ def test_equal_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.equal(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_floor_divide.py b/dpnp/tests/tensor/elementwise/test_floor_divide.py
index 6a18575722b5..5762b09afdb3 100644
--- a/dpnp/tests/tensor/elementwise/test_floor_divide.py
+++ b/dpnp/tests/tensor/elementwise/test_floor_divide.py
@@ -92,9 +92,7 @@ def test_floor_divide_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.floor_divide(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_greater.py b/dpnp/tests/tensor/elementwise/test_greater.py
index 7234bd03d86a..eb5f2b3929df 100644
--- a/dpnp/tests/tensor/elementwise/test_greater.py
+++ b/dpnp/tests/tensor/elementwise/test_greater.py
@@ -166,9 +166,7 @@ def test_greater_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.greater(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_greater_equal.py b/dpnp/tests/tensor/elementwise/test_greater_equal.py
index 888dfbd342b7..f2e97bf62189 100644
--- a/dpnp/tests/tensor/elementwise/test_greater_equal.py
+++ b/dpnp/tests/tensor/elementwise/test_greater_equal.py
@@ -164,9 +164,7 @@ def test_greater_equal_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.greater_equal(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_hypot.py b/dpnp/tests/tensor/elementwise/test_hypot.py
index 7cebaf3bf6ab..bc87736318ee 100644
--- a/dpnp/tests/tensor/elementwise/test_hypot.py
+++ b/dpnp/tests/tensor/elementwise/test_hypot.py
@@ -90,9 +90,7 @@ def test_hypot_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.hypot(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_less.py b/dpnp/tests/tensor/elementwise/test_less.py
index 65fb9c2d9a84..0abf1e440643 100644
--- a/dpnp/tests/tensor/elementwise/test_less.py
+++ b/dpnp/tests/tensor/elementwise/test_less.py
@@ -166,9 +166,7 @@ def test_less_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.less(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_less_equal.py b/dpnp/tests/tensor/elementwise/test_less_equal.py
index b3f9d3b42a69..1a5744475210 100644
--- a/dpnp/tests/tensor/elementwise/test_less_equal.py
+++ b/dpnp/tests/tensor/elementwise/test_less_equal.py
@@ -165,9 +165,7 @@ def test_less_equal_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.less_equal(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_logaddexp.py b/dpnp/tests/tensor/elementwise/test_logaddexp.py
index a1502f4c3d11..fc16c1722d98 100644
--- a/dpnp/tests/tensor/elementwise/test_logaddexp.py
+++ b/dpnp/tests/tensor/elementwise/test_logaddexp.py
@@ -95,9 +95,7 @@ def test_logaddexp_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.logaddexp(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_logical_and.py b/dpnp/tests/tensor/elementwise/test_logical_and.py
index 064c295812b1..09f5838265af 100644
--- a/dpnp/tests/tensor/elementwise/test_logical_and.py
+++ b/dpnp/tests/tensor/elementwise/test_logical_and.py
@@ -193,9 +193,7 @@ def test_logical_and_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.logical_and(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_logical_or.py b/dpnp/tests/tensor/elementwise/test_logical_or.py
index 6987183e37a7..42c7e6f645b3 100644
--- a/dpnp/tests/tensor/elementwise/test_logical_or.py
+++ b/dpnp/tests/tensor/elementwise/test_logical_or.py
@@ -194,9 +194,7 @@ def test_logical_or_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.logical_or(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_logical_xor.py b/dpnp/tests/tensor/elementwise/test_logical_xor.py
index 043c704bcf4b..da2b79974f12 100644
--- a/dpnp/tests/tensor/elementwise/test_logical_xor.py
+++ b/dpnp/tests/tensor/elementwise/test_logical_xor.py
@@ -195,9 +195,7 @@ def test_logical_xor_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.logical_xor(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_maximum_minimum.py b/dpnp/tests/tensor/elementwise/test_maximum_minimum.py
index 7e0bce95baf9..2eb6d9de7582 100644
--- a/dpnp/tests/tensor/elementwise/test_maximum_minimum.py
+++ b/dpnp/tests/tensor/elementwise/test_maximum_minimum.py
@@ -208,16 +208,12 @@ def test_maximum_minimum_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.maximum(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
     r = dpt.minimum(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_multiply.py b/dpnp/tests/tensor/elementwise/test_multiply.py
index df0defc7cfc6..33dbef03f347 100644
--- a/dpnp/tests/tensor/elementwise/test_multiply.py
+++ b/dpnp/tests/tensor/elementwise/test_multiply.py
@@ -91,9 +91,7 @@ def test_multiply_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.multiply(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_not_equal.py b/dpnp/tests/tensor/elementwise/test_not_equal.py
index 718105d2689b..3f0eb58cf8b7 100644
--- a/dpnp/tests/tensor/elementwise/test_not_equal.py
+++ b/dpnp/tests/tensor/elementwise/test_not_equal.py
@@ -90,9 +90,7 @@ def test_not_equal_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.not_equal(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_pow.py b/dpnp/tests/tensor/elementwise/test_pow.py
index 17d54058c320..c68e6ad13b0a 100644
--- a/dpnp/tests/tensor/elementwise/test_pow.py
+++ b/dpnp/tests/tensor/elementwise/test_pow.py
@@ -91,9 +91,7 @@ def test_power_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.pow(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_remainder.py b/dpnp/tests/tensor/elementwise/test_remainder.py
index 0770820599d1..b8d5ca1cf8ae 100644
--- a/dpnp/tests/tensor/elementwise/test_remainder.py
+++ b/dpnp/tests/tensor/elementwise/test_remainder.py
@@ -91,9 +91,7 @@ def test_remainder_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.remainder(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_subtract.py b/dpnp/tests/tensor/elementwise/test_subtract.py
index 70c652c7e65a..70d05f926c23 100644
--- a/dpnp/tests/tensor/elementwise/test_subtract.py
+++ b/dpnp/tests/tensor/elementwise/test_subtract.py
@@ -107,9 +107,7 @@ def test_subtract_usm_type_matrix(op1_usm_type, op2_usm_type):
 
     r = dpt.subtract(ar1, ar2)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
     assert r.usm_type == expected_usm_type
 
 
diff --git a/dpnp/tests/tensor/elementwise/test_type_utils.py b/dpnp/tests/tensor/elementwise/test_type_utils.py
index 45b1501796a3..42e096f4f42d 100644
--- a/dpnp/tests/tensor/elementwise/test_type_utils.py
+++ b/dpnp/tests/tensor/elementwise/test_type_utils.py
@@ -154,7 +154,7 @@ def test_unary_func_arg_validation():
 
 
 def test_binary_func_arg_validation():
-    with pytest.raises(dpctl.utils.ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.add([1, 2, 3], 1)
     try:
         a = dpt.arange(8)
diff --git a/dpnp/tests/tensor/test_tensor_accumulation.py b/dpnp/tests/tensor/test_tensor_accumulation.py
index 66e979e63a38..b7ea9147e100 100644
--- a/dpnp/tests/tensor/test_tensor_accumulation.py
+++ b/dpnp/tests/tensor/test_tensor_accumulation.py
@@ -29,7 +29,6 @@
 from random import randrange
 
 import pytest
-from dpctl.utils import ExecutionPlacementError
 
 import dpnp.tensor as dpt
 
@@ -308,7 +307,7 @@ def test_accumulator_arg_validation():
 
     # compute follows data
     out_wrong_queue = dpt.empty_like(x2, sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.cumulative_sum(x2, out=out_wrong_queue)
 
 
diff --git a/dpnp/tests/tensor/test_tensor_asarray.py b/dpnp/tests/tensor/test_tensor_asarray.py
index 33d6d00e3ba8..f5caacacdac6 100644
--- a/dpnp/tests/tensor/test_tensor_asarray.py
+++ b/dpnp/tests/tensor/test_tensor_asarray.py
@@ -404,7 +404,7 @@ def test_asarray_seq_of_arrays_on_different_queues():
     assert res.sycl_queue == w.sycl_queue
     assert dpt.isdtype(res.dtype, "integral")
 
-    with pytest.raises(dpctl.utils.ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.asarray([m, [w, py_seq]])
 
 
diff --git a/dpnp/tests/tensor/test_tensor_clip.py b/dpnp/tests/tensor/test_tensor_clip.py
index de4717f22023..759fc0ef11c7 100644
--- a/dpnp/tests/tensor/test_tensor_clip.py
+++ b/dpnp/tests/tensor/test_tensor_clip.py
@@ -29,7 +29,6 @@
 import dpctl
 import numpy as np
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_raises_regex
 
 import dpnp.tensor as dpt
@@ -392,7 +391,7 @@ def test_clip_usm_type_matrix(usm_type1, usm_type2, usm_type3):
 
     r = dpt.clip(ar1, ar2, ar3)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
+    expected_usm_type = dpt.get_coerced_usm_type(
         (usm_type1, usm_type2, usm_type3)
     )
     assert r.usm_type == expected_usm_type
@@ -409,7 +408,7 @@ def test_clip_usm_type_matrix_none_arg(usm_type1, usm_type2):
 
     r = dpt.clip(ar1, min=ar2, max=None)
     assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type((usm_type1, usm_type2))
+    expected_usm_type = dpt.get_coerced_usm_type((usm_type1, usm_type2))
     assert r.usm_type == expected_usm_type
 
 
@@ -457,7 +456,7 @@ def test_clip_errors():
     ar3 = dpt.ones_like(ar1, sycl_queue=gpu_queue)
     ar4 = dpt.empty_like(ar1, sycl_queue=cpu_queue)
     assert_raises_regex(
-        ExecutionPlacementError,
+        dpt.ExecutionPlacementError,
         "Input and output allocation queues are not compatible",
         dpt.clip,
         ar1,
@@ -467,7 +466,7 @@ def test_clip_errors():
     )
 
     assert_raises_regex(
-        ExecutionPlacementError,
+        dpt.ExecutionPlacementError,
         "Input and output allocation queues are not compatible",
         dpt.clip,
         ar1,
@@ -477,7 +476,7 @@ def test_clip_errors():
     )
 
     assert_raises_regex(
-        ExecutionPlacementError,
+        dpt.ExecutionPlacementError,
         "Execution placement can not be unambiguously inferred from input "
         "arguments.",
         dpt.clip,
@@ -488,7 +487,7 @@ def test_clip_errors():
     )
 
     assert_raises_regex(
-        ExecutionPlacementError,
+        dpt.ExecutionPlacementError,
         "Execution placement can not be unambiguously inferred from input "
         "arguments.",
         dpt.clip,
@@ -499,7 +498,7 @@ def test_clip_errors():
     )
 
     assert_raises_regex(
-        ExecutionPlacementError,
+        dpt.ExecutionPlacementError,
         "Execution placement can not be unambiguously inferred from input "
         "arguments.",
         dpt.clip,
@@ -510,7 +509,7 @@ def test_clip_errors():
     )
 
     assert_raises_regex(
-        ExecutionPlacementError,
+        dpt.ExecutionPlacementError,
         "Execution placement can not be unambiguously inferred from input "
         "arguments.",
         dpt.clip,
@@ -750,19 +749,19 @@ def test_clip_compute_follows_data():
     a_max = dpt.ones(10, dtype="i4", sycl_queue=q1)
     res = dpt.empty_like(x, sycl_queue=q2)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.clip(x, a_min, a_max)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.clip(x, dpt.ones_like(x), a_max, out=res)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.clip(x, a_min)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.clip(x, None, a_max, out=res)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.clip(x, out=res)
 
 
diff --git a/dpnp/tests/tensor/test_tensor_diff.py b/dpnp/tests/tensor/test_tensor_diff.py
index 10153b5f5cc5..e5beea6845b1 100644
--- a/dpnp/tests/tensor/test_tensor_diff.py
+++ b/dpnp/tests/tensor/test_tensor_diff.py
@@ -29,7 +29,6 @@
 from math import prod
 
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_raises_regex
 
 import dpnp.tensor as dpt
@@ -305,19 +304,19 @@ def test_diff_compute_follows_data():
     ar2 = dpt.ones(1, dtype="i4", sycl_queue=q2)
     ar3 = dpt.ones(1, dtype="i4", sycl_queue=q3)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.diff(ar1, prepend=ar2, append=ar3)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.diff(ar1, prepend=ar2, append=0)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.diff(ar1, prepend=0, append=ar2)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.diff(ar1, prepend=ar2)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.diff(ar1, append=ar2)
 
 
diff --git a/dpnp/tests/tensor/test_tensor_isin.py b/dpnp/tests/tensor/test_tensor_isin.py
index 0bb22ea242ad..08f1787f733f 100644
--- a/dpnp/tests/tensor/test_tensor_isin.py
+++ b/dpnp/tests/tensor/test_tensor_isin.py
@@ -30,7 +30,6 @@
 
 import numpy as np
 import pytest
-from dpctl.utils import ExecutionPlacementError
 
 import dpnp.tensor as dpt
 
@@ -225,7 +224,7 @@ def test_isin_empty_inputs():
 
 def test_isin_validation():
     get_queue_or_skip()
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.isin(1, 1)
     not_bool = {}
     with pytest.raises(TypeError):
@@ -278,5 +277,5 @@ def test_isin_compute_follows_data():
 
     x = dpt.ones(10, sycl_queue=q1)
     test = dpt.ones_like(x, sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.isin(x, test)
diff --git a/dpnp/tests/tensor/test_usm_ndarray_ctor.py b/dpnp/tests/tensor/test_usm_ndarray_ctor.py
index cb185ff64a1f..70066860b19f 100644
--- a/dpnp/tests/tensor/test_usm_ndarray_ctor.py
+++ b/dpnp/tests/tensor/test_usm_ndarray_ctor.py
@@ -1442,14 +1442,14 @@ def test_full_compute_follows_data():
 
     assert Y.dtype == X.dtype
     assert Y.usm_type == X.usm_type
-    assert dpctl.utils.get_execution_queue((Y.sycl_queue, X.sycl_queue))
+    assert dpt.get_execution_queue((Y.sycl_queue, X.sycl_queue))
     assert np.array_equal(dpt.asnumpy(Y), np.full(10, 3, dtype="i4"))
 
     Y = dpt.full(10, X[3], dtype="f4", sycl_queue=q2, usm_type="host")
 
     assert Y.dtype == dpt.dtype("f4")
     assert Y.usm_type == "host"
-    assert dpctl.utils.get_execution_queue((Y.sycl_queue, q2))
+    assert dpt.get_execution_queue((Y.sycl_queue, q2))
     assert np.array_equal(dpt.asnumpy(Y), np.full(10, 3, dtype="f4"))
 
 
diff --git a/dpnp/tests/tensor/test_usm_ndarray_indexing.py b/dpnp/tests/tensor/test_usm_ndarray_indexing.py
index 530d4ab2988c..b81e5456872b 100644
--- a/dpnp/tests/tensor/test_usm_ndarray_indexing.py
+++ b/dpnp/tests/tensor/test_usm_ndarray_indexing.py
@@ -29,7 +29,6 @@
 import dpctl
 import numpy as np
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_array_equal
 
 import dpnp.tensor as dpt
@@ -1154,17 +1153,17 @@ def test_advanced_indexing_compute_follows_data():
     val0 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q1)
     val1 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q2)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.take(x, ind1, axis=0)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         x[ind1]
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.put(x, ind1, val0, axis=0)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         x[ind1] = val0
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.put(x, ind0, val1, axis=0)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         x[ind0] = val1
 
 
@@ -1500,7 +1499,7 @@ def test_extract_arg_validation():
     with pytest.raises(TypeError):
         dpt.extract(cond, None)
     q1 = dpctl.SyclQueue()
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.extract(cond.to_device(q1), dpt.zeros_like(cond, dtype="u1"))
     with pytest.raises(ValueError):
         dpt.extract(dpt.ones((2, 3), dtype="?"), dpt.ones((3, 2), dtype="i1"))
@@ -1518,7 +1517,7 @@ def test_place_arg_validation():
         dpt.place(arr, cond, None)
     vals = dpt.ones_like(arr)
     q1 = dpctl.SyclQueue()
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.place(arr.to_device(q1), cond, vals)
     with pytest.raises(ValueError):
         dpt.place(dpt.reshape(arr, (2, 2, 2)), cond, vals)
@@ -1699,7 +1698,7 @@ def test_take_along_axis_validation():
     # check compute-follows-data
     q2 = dpctl.SyclQueue(x_dev, property="enable_profiling")
     ind2 = dpt.zeros(1, dtype=ind_dt, sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.take_along_axis(x, ind2)
 
 
@@ -1764,7 +1763,7 @@ def test_put_along_axis_validation():
     # check compute-follows-data
     q2 = dpctl.SyclQueue(x_dev, property="enable_profiling")
     ind2 = dpt.zeros(1, dtype=ind_dt, sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.put_along_axis(x, ind2, vals)
 
 
@@ -1822,7 +1821,7 @@ def check__extract_impl_validation(fn):
         fn(x, list())
     q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling")
     ind2 = dpt.ones(10, dtype="?", sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         fn(x, ind2)
     with pytest.raises(ValueError):
         fn(x, ind, 1)
@@ -1850,7 +1849,7 @@ def check__take_multi_index(fn):
         fn(x, (x,), 1)
     q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling")
     ind2 = dpt.arange(10, dtype=ind_dt, sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         fn(x, (ind2,), 0)
     m = dpt.ones((10, 10))
     ind_1 = dpt.arange(10, dtype="i8")
@@ -1867,7 +1866,7 @@ def check__place_impl_validation(fn):
         fn(x, list(), list())
     q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling")
     mask2 = dpt.ones(10, dtype="?", sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         fn(x, mask2, 1)
     x2 = dpt.ones((5, 5))
     mask2 = dpt.ones((5, 5), dtype="?")
@@ -2037,7 +2036,7 @@ def test_take_out_errors():
         dpt.take(x, ind, out=out_bad_dt)
 
     out_bad_q = dpt.empty(ind.shape, dtype=x.dtype, sycl_queue=q2)
-    with pytest.raises(dpctl.utils.ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.take(x, ind, out=out_bad_q)
 
 
diff --git a/dpnp/tests/tensor/test_usm_ndarray_linalg.py b/dpnp/tests/tensor/test_usm_ndarray_linalg.py
index 13b03ff66fd5..c28754ca080f 100644
--- a/dpnp/tests/tensor/test_usm_ndarray_linalg.py
+++ b/dpnp/tests/tensor/test_usm_ndarray_linalg.py
@@ -31,7 +31,6 @@
 import dpctl
 import numpy as np
 import pytest
-from dpctl.utils import ExecutionPlacementError
 
 import dpnp.tensor as dpt
 
@@ -452,7 +451,7 @@ def test_matmul_out_errors():
     with pytest.raises(ValueError):
         dpt.matmul(m1, m2, out=dpt.empty(sh, dtype="f4", sycl_queue=q1))
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.matmul(m1, m2, out=dpt.empty(sh, dtype=dt, sycl_queue=q2))
 
 
@@ -536,7 +535,7 @@ def test_matmul_compute_follows_data():
     m1 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
     m2 = dpt.zeros(sh, dtype=dt, sycl_queue=q2)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.matmul(m1, m2)
 
 
@@ -751,7 +750,7 @@ def test_tensordot_validation():
 
     t2 = dpt.empty((10, 10, 10))
     q = dpctl.SyclQueue(t2.sycl_context, t2.sycl_device, property="in_order")
-    with pytest.raises(dpctl.utils.ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.tensordot(t1, t2.to_device(q))
 
     invalid_axes = (
@@ -938,7 +937,7 @@ def test_vector_arg_validation():
     q = dpctl.SyclQueue(
         v2.sycl_context, v2.sycl_device, property="enable_profiling"
     )
-    with pytest.raises(dpctl.utils.ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.vecdot(v1, v2.to_device(q))
 
     m1 = dpt.empty((10, 5))
diff --git a/dpnp/tests/tensor/test_usm_ndarray_manipulation.py b/dpnp/tests/tensor/test_usm_ndarray_manipulation.py
index 45a53aa0532d..0375bb446370 100644
--- a/dpnp/tests/tensor/test_usm_ndarray_manipulation.py
+++ b/dpnp/tests/tensor/test_usm_ndarray_manipulation.py
@@ -31,7 +31,6 @@
 import dpctl
 import numpy as np
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_, assert_array_equal, assert_raises_regex
 
 import dpnp.tensor as dpt
@@ -1413,7 +1412,7 @@ def test_repeat_arg_validation():
     # compute follows data
     q2 = dpctl.SyclQueue()
     reps = dpt.asarray(1, dtype="i8", sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.repeat(x, reps)
 
     # repeats array must not contain negative elements
diff --git a/dpnp/tests/tensor/test_usm_ndarray_reductions.py b/dpnp/tests/tensor/test_usm_ndarray_reductions.py
index 4d828fbdbd49..2c431efa936d 100644
--- a/dpnp/tests/tensor/test_usm_ndarray_reductions.py
+++ b/dpnp/tests/tensor/test_usm_ndarray_reductions.py
@@ -30,7 +30,6 @@
 
 import numpy as np
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_allclose
 
 import dpnp.tensor as dpt
@@ -56,7 +55,6 @@
     "f8",
 ]
 
-
 _all_dtypes = _no_complex_dtypes + [
     "c8",
     "c16",
@@ -650,11 +648,11 @@ def test_reduction_out_kwarg_arg_validation():
         dpt.max(x, out=dict())
     with pytest.raises(TypeError):
         dpt.argmax(x, out=dict())
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.sum(x, out=out_wrong_queue)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.max(x, out=out_wrong_queue)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.argmax(x, out=dpt.empty_like(out_wrong_queue, dtype=ind_dt))
     with pytest.raises(ValueError):
         dpt.sum(x, out=out_wrong_dtype)
diff --git a/dpnp/tests/tensor/test_usm_ndarray_search_functions.py b/dpnp/tests/tensor/test_usm_ndarray_search_functions.py
index 30be5f0ee4f5..33942d93c3a7 100644
--- a/dpnp/tests/tensor/test_usm_ndarray_search_functions.py
+++ b/dpnp/tests/tensor/test_usm_ndarray_search_functions.py
@@ -31,7 +31,6 @@
 
 import numpy as np
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_array_equal
 
 import dpnp.tensor as dpt
@@ -383,11 +382,11 @@ def test_where_compute_follows_data():
     x1 = dpt.empty((1,), dtype="i4", sycl_queue=q1)
     x2 = dpt.empty((1,), dtype="i4", sycl_queue=q2)
 
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.where(dpt.empty((1,), dtype="i4", sycl_queue=q1), x1, x2)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.where(dpt.empty((1,), dtype="i4", sycl_queue=q3), x1, x2)
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.where(x1, x1, x2)
 
 
@@ -533,7 +532,7 @@ def test_where_out_arg_validation():
 
     with pytest.raises(TypeError):
         dpt.where(condition, x1, x2, out=dict())
-    with pytest.raises(ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.where(condition, x1, x2, out=out_wrong_queue)
     with pytest.raises(ValueError):
         dpt.where(condition, x1, x2, out=out_wrong_dtype)
diff --git a/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py b/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py
index d97e224b61cc..aef782f06f08 100644
--- a/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py
+++ b/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py
@@ -27,7 +27,6 @@
 # *****************************************************************************
 
 import dpctl
-import dpctl.utils as dpu
 import numpy as np
 import pytest
 
@@ -334,10 +333,10 @@ def test_searchsorted_validation2():
     q2 = dpctl.SyclQueue(d, property="in_order")
     x2 = dpt.ones(5, dtype=x1.dtype, sycl_queue=q2)
 
-    with pytest.raises(dpu.ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.searchsorted(x1, x2)
 
-    with pytest.raises(dpu.ExecutionPlacementError):
+    with pytest.raises(dpt.ExecutionPlacementError):
         dpt.searchsorted(x1, x2, sorter=sorter)
 
     sorter = dpt.ones(x1.shape, dtype=dpt.bool)
diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py
index c39303f32c1a..31215d725f82 100644
--- a/dpnp/tests/test_fft.py
+++ b/dpnp/tests/test_fft.py
@@ -1,7 +1,6 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_raises
 
 import dpnp
@@ -277,7 +276,7 @@ def test_validate_out(self):
         # Inconsistent sycl_queue
         a = dpnp.ones((10,), dtype=dpnp.complex64, sycl_queue=dpctl.SyclQueue())
         out = dpnp.empty((10,), sycl_queue=dpctl.SyclQueue())
-        assert_raises(ExecutionPlacementError, dpnp.fft.fft, a, out=out)
+        assert_raises(dpt.ExecutionPlacementError, dpnp.fft.fft, a, out=out)
 
         # Invalid shape
         a = dpnp.ones((10,), dtype=dpnp.complex64)
diff --git a/dpnp/tests/test_fill.py b/dpnp/tests/test_fill.py
index 3102de395d93..6d6f7049d9e3 100644
--- a/dpnp/tests/test_fill.py
+++ b/dpnp/tests/test_fill.py
@@ -1,9 +1,9 @@
 import dpctl
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_array_equal
 
 import dpnp
+from dpnp.tensor import ExecutionPlacementError
 
 
 @pytest.mark.parametrize(
diff --git a/dpnp/tests/test_indexing.py b/dpnp/tests/test_indexing.py
index 3bdd5449d223..78eec020649b 100644
--- a/dpnp/tests/test_indexing.py
+++ b/dpnp/tests/test_indexing.py
@@ -3,7 +3,6 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_,
     assert_array_equal,
@@ -1380,7 +1379,7 @@ def test_compress_invalid_out_errors(self):
         with pytest.raises(ValueError):
             dpnp.compress(condition, a, out=out_bad_shape)
         out_bad_queue = dpnp.empty(1, dtype="i4", sycl_queue=q2)
-        with pytest.raises(ExecutionPlacementError):
+        with pytest.raises(dpt.ExecutionPlacementError):
             dpnp.compress(condition, a, out=out_bad_queue)
         out_bad_dt = dpnp.empty(1, dtype="i8", sycl_queue=q1)
         with pytest.raises(TypeError):
@@ -1496,7 +1495,7 @@ def test_choose_invalid_out_errors(self):
         with pytest.raises(ValueError):
             dpnp.choose(inds, [chcs], out=out_bad_shape)
         out_bad_queue = dpnp.empty(chcs.shape, dtype=chcs.dtype, sycl_queue=q2)
-        with pytest.raises(ExecutionPlacementError):
+        with pytest.raises(dpt.ExecutionPlacementError):
             dpnp.choose(inds, [chcs], out=out_bad_queue)
         out_bad_dt = dpnp.empty(chcs.shape, dtype="i8", sycl_queue=q1)
         with pytest.raises(TypeError):
diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index d32237c04aad..524fd4869b0c 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -3,7 +3,6 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_allclose,
     assert_array_equal,
@@ -607,7 +606,9 @@ def test_einsum_error1(self):
         a = dpnp.ones((5, 5))
         out = dpnp.empty((5,), sycl_queue=dpctl.SyclQueue())
         # inconsistent sycl_queue
-        assert_raises(ExecutionPlacementError, dpnp.einsum, "ii->i", a, out=out)
+        assert_raises(
+            dpt.ExecutionPlacementError, dpnp.einsum, "ii->i", a, out=out
+        )
 
         # unknown value for optimize keyword
         assert_raises(TypeError, dpnp.einsum, "ii->i", a, optimize="blah")
@@ -2777,7 +2778,7 @@ def test_matrix_rank_errors(self):
         a_dp_q = dpnp.array(a_dp, sycl_queue=a_queue)
         tol_dp_q = dpnp.array([0.5], dtype="float32", sycl_queue=tol_queue)
         assert_raises(
-            ExecutionPlacementError,
+            dpt.ExecutionPlacementError,
             dpnp.linalg.matrix_rank,
             a_dp_q,
             tol_dp_q,
diff --git a/dpnp/tests/test_logic.py b/dpnp/tests/test_logic.py
index cae51e6777ef..1ff9a780d88d 100644
--- a/dpnp/tests/test_logic.py
+++ b/dpnp/tests/test_logic.py
@@ -1,7 +1,6 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_allclose,
     assert_array_equal,
@@ -10,6 +9,7 @@
 )
 
 import dpnp
+from dpnp.tensor import ExecutionPlacementError
 
 from .helper import (
     generate_random_numpy_array,
diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py
index ae273ffa8c03..c9a506fa6238 100644
--- a/dpnp/tests/test_mathematical.py
+++ b/dpnp/tests/test_mathematical.py
@@ -1,7 +1,6 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_allclose,
     assert_array_equal,
@@ -710,12 +709,14 @@ def test_errors(self):
         # another `to_begin` sycl queue
         to_begin = dpnp.array([-20, -15], sycl_queue=dpctl.SyclQueue())
         assert_raises(
-            ExecutionPlacementError, dpnp.ediff1d, ia, to_begin=to_begin
+            dpt.ExecutionPlacementError, dpnp.ediff1d, ia, to_begin=to_begin
         )
 
         # another `to_end` sycl queue
         to_end = dpnp.array([15, 20], sycl_queue=dpctl.SyclQueue())
-        assert_raises(ExecutionPlacementError, dpnp.ediff1d, ia, to_end=to_end)
+        assert_raises(
+            dpt.ExecutionPlacementError, dpnp.ediff1d, ia, to_end=to_end
+        )
 
 
 class TestGradient:
@@ -2131,13 +2132,13 @@ def test_unary_two_outs_cfd_error(self, func):
         out1 = dpnp.empty((), sycl_queue=dpctl.SyclQueue())
         out2 = dpnp.empty((), sycl_queue=dpctl.SyclQueue())
         with pytest.raises(
-            ExecutionPlacementError,
+            dpt.ExecutionPlacementError,
             match="Input and output allocation queues are not compatible",
         ):
             _ = fn(*args, out1)
 
         with pytest.raises(
-            ExecutionPlacementError,
+            dpt.ExecutionPlacementError,
             match="Input and output allocation queues are not compatible",
         ):
             _ = fn(*args, out=(None, out2))
diff --git a/dpnp/tests/test_nanfunctions.py b/dpnp/tests/test_nanfunctions.py
index 3c5ea5b61989..49f47ebec043 100644
--- a/dpnp/tests/test_nanfunctions.py
+++ b/dpnp/tests/test_nanfunctions.py
@@ -1,7 +1,6 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_allclose,
     assert_almost_equal,
@@ -496,7 +495,7 @@ def test_error(self):
         # out has a different queue
         exec_q = dpctl.SyclQueue()
         res = dpnp.empty(2, dtype=a.dtype, sycl_queue=exec_q)
-        with pytest.raises(ExecutionPlacementError):
+        with pytest.raises(dpt.ExecutionPlacementError):
             dpnp.nanmedian(a, axis=1, out=res)
 
 
diff --git a/dpnp/tests/test_product.py b/dpnp/tests/test_product.py
index cd71b07352da..abbaa307ee03 100644
--- a/dpnp/tests/test_product.py
+++ b/dpnp/tests/test_product.py
@@ -1,11 +1,11 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_allclose, assert_array_equal, assert_raises
 
 import dpnp
 from dpnp.dpnp_utils import map_dtype_to_device
+from dpnp.tensor import ExecutionPlacementError
 from dpnp.tensor._numpy_helper import AxisError
 
 from .helper import (
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index f1678bd28da3..361b10444192 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -4,7 +4,6 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_array_equal, assert_raises
 
 import dpnp
@@ -50,7 +49,7 @@ def assert_sycl_queue_equal(result, expected):
     assert result.sycl_device == expected.sycl_device
     assert result.is_in_order == expected.is_in_order
     assert result.has_enable_profiling == expected.has_enable_profiling
-    exec_queue = dpctl.utils.get_execution_queue([result, expected])
+    exec_queue = dpt.get_execution_queue([result, expected])
     assert exec_queue is not None
 
 
@@ -657,7 +656,7 @@ def test_2in_broadcasting(func, data1, data2, device):
 def test_2in_1out_diff_queue_but_equal_context(func, device):
     x1 = dpnp.arange(10)
     x2 = dpnp.arange(10, sycl_queue=dpctl.SyclQueue(device))[::-1]
-    with assert_raises((ValueError, ExecutionPlacementError)):
+    with assert_raises((ValueError, dpt.ExecutionPlacementError)):
         getattr(dpnp, func)(x1, x2)
 
 
diff --git a/dpnp/tests/test_usm_type.py b/dpnp/tests/test_usm_type.py
index a0cfe6d24979..78ef41f9ea18 100644
--- a/dpnp/tests/test_usm_type.py
+++ b/dpnp/tests/test_usm_type.py
@@ -2,7 +2,6 @@
 import tempfile
 from math import prod
 
-import dpctl.utils as du
 import numpy
 import pytest
 
@@ -29,7 +28,7 @@ def test_add(usm_type_x, usm_type_y):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types)
@@ -46,7 +45,7 @@ def test_multiply(usm_type_x, usm_type_y):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types)
@@ -63,7 +62,7 @@ def test_subtract(usm_type_x, usm_type_y):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types)
@@ -80,7 +79,7 @@ def test_divide(usm_type_x, usm_type_y):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types)
@@ -100,7 +99,7 @@ def test_remainder(usm_type_x, usm_type_y):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types)
@@ -121,7 +120,7 @@ def test_floor_divide(usm_type_x, usm_type_y):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types)
@@ -136,7 +135,7 @@ def test_power(usm_type_x, usm_type_y):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize(
@@ -320,7 +319,7 @@ def test_linspace_arrays(usm_type_start, usm_type_stop):
     start = dpnp.array([0, 0], usm_type=usm_type_start)
     stop = dpnp.array([2, 4], usm_type=usm_type_stop)
     res = dpnp.linspace(start, stop, 4)
-    assert res.usm_type == du.get_coerced_usm_type(
+    assert res.usm_type == dpt.get_coerced_usm_type(
         [usm_type_start, usm_type_stop]
     )
 
@@ -376,7 +375,7 @@ def test_logic_op_2in(op, usm_type_x, usm_type_y):
 
     assert x.usm_type == zx.usm_type == usm_type_x
     assert y.usm_type == zy.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("op", ["bitwise_count", "bitwise_not"])
@@ -404,7 +403,7 @@ def test_bitwise_op_2in(op, usm_type_x, usm_type_y):
 
     assert x.usm_type == zx.usm_type == usm_type_x
     assert y.usm_type == zy.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 class TestMatmul:
@@ -445,7 +444,7 @@ def test_basic(self, usm_type_x, usm_type_y, dtype, shape1, shape2):
 
         assert x.usm_type == usm_type_x
         assert y.usm_type == usm_type_y
-        assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+        assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
     @pytest.mark.parametrize("usm_type", list_of_usm_types)
     def test_syrk(self, usm_type):
@@ -474,7 +473,7 @@ def test_matvec(usm_type_x, usm_type_y, shape1, shape2):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types)
@@ -496,7 +495,7 @@ def test_vecdot(usm_type_x, usm_type_y, shape1, shape2):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types)
@@ -518,7 +517,7 @@ def test_vecmat(usm_type_x, usm_type_y, shape1, shape2):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types)
@@ -744,7 +743,7 @@ def test_2in_1out(func, data1, data2, usm_type_x, usm_type_y):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize(
@@ -765,7 +764,7 @@ def test_2in_2out(func, data1, data2, usm_type_x, usm_type_y):
     assert (
         z1.usm_type
         == z2.usm_type
-        == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+        == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
     )
 
 
@@ -811,7 +810,7 @@ def test_piecewise(usm_type_x, usm_type_y, usm_type_z):
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
     assert z.usm_type == usm_type_z
-    assert result.usm_type == du.get_coerced_usm_type(
+    assert result.usm_type == dpt.get_coerced_usm_type(
         [usm_type_x, usm_type_y, usm_type_z]
     )
 
@@ -836,7 +835,7 @@ def test_concat_stack(func, data1, data2, usm_type_x, usm_type_y):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types)
@@ -848,7 +847,7 @@ def test_extract(usm_type_x, usm_type_y):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize(
@@ -896,7 +895,9 @@ def test_obj_ndarray(self, usm_type, usm_type_other):
 
         assert x.usm_type == usm_type
         assert y.usm_type == usm_type_other
-        assert z.usm_type == du.get_coerced_usm_type([usm_type, usm_type_other])
+        assert z.usm_type == dpt.get_coerced_usm_type(
+            [usm_type, usm_type_other]
+        )
 
 
 @pytest.mark.parametrize("usm_type", list_of_usm_types)
@@ -941,7 +942,9 @@ def test_values_ndarray(self, obj, usm_type, usm_type_other):
 
         assert x.usm_type == usm_type
         assert y.usm_type == usm_type_other
-        assert z.usm_type == du.get_coerced_usm_type([usm_type, usm_type_other])
+        assert z.usm_type == dpt.get_coerced_usm_type(
+            [usm_type, usm_type_other]
+        )
 
     @pytest.mark.parametrize("values", [-2, [-1, -2]], ids=["scalar", "list"])
     @pytest.mark.parametrize("usm_type_other", list_of_usm_types)
@@ -952,7 +955,9 @@ def test_obj_ndarray(self, values, usm_type, usm_type_other):
 
         assert x.usm_type == usm_type
         assert y.usm_type == usm_type_other
-        assert z.usm_type == du.get_coerced_usm_type([usm_type, usm_type_other])
+        assert z.usm_type == dpt.get_coerced_usm_type(
+            [usm_type, usm_type_other]
+        )
 
     @pytest.mark.parametrize("usm_type_y", list_of_usm_types)
     @pytest.mark.parametrize("usm_type_z", list_of_usm_types)
@@ -965,7 +970,7 @@ def test_obj_values_ndarray(self, usm_type, usm_type_y, usm_type_z):
         assert x.usm_type == usm_type
         assert y.usm_type == usm_type_y
         assert z.usm_type == usm_type_z
-        assert res.usm_type == du.get_coerced_usm_type(
+        assert res.usm_type == dpt.get_coerced_usm_type(
             [usm_type, usm_type_y, usm_type_z]
         )
 
@@ -980,7 +985,7 @@ def test_take(func, usm_type_x, usm_type_ind):
 
     assert x.usm_type == usm_type_x
     assert ind.usm_type == usm_type_ind
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_ind])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_ind])
 
 
 @pytest.mark.parametrize(
@@ -1004,7 +1009,7 @@ def test_take_along_axis(data, ind, axis, usm_type_x, usm_type_ind):
 
     assert x.usm_type == usm_type_x
     assert ind.usm_type == usm_type_ind
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_ind])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_ind])
 
 
 @pytest.mark.parametrize("usm_type", list_of_usm_types + [None])
@@ -1156,8 +1161,8 @@ def test_histogram(usm_type_v, usm_type_w):
     hist, edges = dpnp.histogram(v, weights=w)
     assert v.usm_type == usm_type_v
     assert w.usm_type == usm_type_w
-    assert hist.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w])
-    assert edges.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w])
+    assert hist.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w])
+    assert edges.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types)
@@ -1172,13 +1177,13 @@ def test_histogram2d(usm_type_x, usm_type_y, usm_type_w):
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
     assert w.usm_type == usm_type_w
-    assert hist.usm_type == du.get_coerced_usm_type(
+    assert hist.usm_type == dpt.get_coerced_usm_type(
         [usm_type_x, usm_type_y, usm_type_w]
     )
-    assert edges_x.usm_type == du.get_coerced_usm_type(
+    assert edges_x.usm_type == dpt.get_coerced_usm_type(
         [usm_type_x, usm_type_y, usm_type_w]
     )
-    assert edges_y.usm_type == du.get_coerced_usm_type(
+    assert edges_y.usm_type == dpt.get_coerced_usm_type(
         [usm_type_x, usm_type_y, usm_type_w]
     )
 
@@ -1192,7 +1197,7 @@ def test_bincount(usm_type_v, usm_type_w):
     hist = dpnp.bincount(v, weights=w)
     assert v.usm_type == usm_type_v
     assert w.usm_type == usm_type_w
-    assert hist.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w])
+    assert hist.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w])
 
 
 @pytest.mark.parametrize("usm_type_v", list_of_usm_types)
@@ -1204,9 +1209,9 @@ def test_histogramdd(usm_type_v, usm_type_w):
     hist, edges = dpnp.histogramdd(v, weights=w)
     assert v.usm_type == usm_type_v
     assert w.usm_type == usm_type_w
-    assert hist.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w])
+    assert hist.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w])
     for e in edges:
-        assert e.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w])
+        assert e.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w])
 
 
 @pytest.mark.parametrize(
@@ -1247,7 +1252,7 @@ def test_histogram_bin_edges(usm_type_v, usm_type_w):
     edges = dpnp.histogram_bin_edges(v, weights=w)
     assert v.usm_type == usm_type_v
     assert w.usm_type == usm_type_w
-    assert edges.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w])
+    assert edges.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types)
@@ -1256,7 +1261,7 @@ def test_select(usm_type_x, usm_type_y):
     condlist = [dpnp.array([True, False], usm_type=usm_type_x)]
     choicelist = [dpnp.array([1, 2], usm_type=usm_type_y)]
     res = dpnp.select(condlist, choicelist)
-    assert res.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
+    assert res.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("axis", [None, 0, -1])
@@ -1300,7 +1305,7 @@ def test_ediff1d(usm_type_x, usm_type_args, to_end, to_begin):
 
     res = dpnp.ediff1d(x, to_end=to_end, to_begin=to_begin)
 
-    assert res.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_args])
+    assert res.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_args])
 
 
 @pytest.mark.parametrize("usm_type", list_of_usm_types)
@@ -1337,7 +1342,7 @@ def test_choose(usm_type_x, usm_type_ind):
 
     assert chc.usm_type == usm_type_x
     assert ind.usm_type == usm_type_ind
-    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_ind])
+    assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_ind])
 
 
 @pytest.mark.parametrize(
@@ -1371,7 +1376,7 @@ def test_basic(self, usm_type_x, usm_type_xp, usm_type_fp):
         assert x.usm_type == usm_type_x
         assert xp.usm_type == usm_type_xp
         assert fp.usm_type == usm_type_fp
-        assert result.usm_type == du.get_coerced_usm_type(
+        assert result.usm_type == dpt.get_coerced_usm_type(
             [usm_type_x, usm_type_xp, usm_type_fp]
         )
 
@@ -1390,7 +1395,7 @@ def test_left_right(self, usm_type_x, usm_type_left, usm_type_right):
 
         assert left.usm_type == usm_type_left
         assert right.usm_type == usm_type_right
-        assert result.usm_type == du.get_coerced_usm_type(
+        assert result.usm_type == dpt.get_coerced_usm_type(
             [
                 x.usm_type,
                 xp.usm_type,
@@ -1523,7 +1528,7 @@ def test_lstsq(self, m, n, nrhs, usm_type, usm_type_other):
         assert a.usm_type == usm_type
         assert b.usm_type == usm_type_other
         for param in result:
-            assert param.usm_type == du.get_coerced_usm_type(
+            assert param.usm_type == dpt.get_coerced_usm_type(
                 [usm_type, usm_type_other]
             )
 
@@ -1558,7 +1563,7 @@ def test_lu_solve(self, a_data, b_data, usm_type, usm_type_rhs):
 
         assert lu.usm_type == usm_type
         assert b.usm_type == usm_type_rhs
-        assert result.usm_type == du.get_coerced_usm_type(
+        assert result.usm_type == dpt.get_coerced_usm_type(
             [usm_type, usm_type_rhs]
         )
 
@@ -1718,7 +1723,7 @@ def test_solve(self, matrix, rhs, usm_type, usm_type_rhs):
 
         assert x.usm_type == usm_type
         assert y.usm_type == usm_type_rhs
-        assert z.usm_type == du.get_coerced_usm_type([usm_type, usm_type_rhs])
+        assert z.usm_type == dpt.get_coerced_usm_type([usm_type, usm_type_rhs])
 
     @pytest.mark.parametrize("full_matrices_param", [True, False])
     @pytest.mark.parametrize("compute_uv_param", [True, False])
@@ -1784,6 +1789,6 @@ def test_tensorsolve(self, usm_type, usm_type_other):
 
         assert a.usm_type == usm_type
         assert b.usm_type == usm_type_other
-        assert result.usm_type == du.get_coerced_usm_type(
+        assert result.usm_type == dpt.get_coerced_usm_type(
             [usm_type, usm_type_other]
         )

From d835f965f25e0caa143e8fad464161188d76c1f0 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 10 Apr 2026 09:13:10 -0700
Subject: [PATCH 26/43] Use ExecutionPlacementError from dpnp.exceptions in
 dpnp

---
 dpnp/tests/test_fft.py        | 2 +-
 dpnp/tests/test_sycl_queue.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py
index 6c481e4b7917..f8cc95a7a3ca 100644
--- a/dpnp/tests/test_fft.py
+++ b/dpnp/tests/test_fft.py
@@ -277,7 +277,7 @@ def test_validate_out(self):
         # Inconsistent sycl_queue
         a = dpnp.ones((10,), dtype=dpnp.complex64, sycl_queue=dpctl.SyclQueue())
         out = dpnp.empty((10,), sycl_queue=dpctl.SyclQueue())
-        assert_raises(dpt.ExecutionPlacementError, dpnp.fft.fft, a, out=out)
+        assert_raises(ExecutionPlacementError, dpnp.fft.fft, a, out=out)
 
         # Invalid shape
         a = dpnp.ones((10,), dtype=dpnp.complex64)
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index d8ef27de50a7..5420285d5940 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -674,7 +674,7 @@ def test_2in_broadcasting(func, data1, data2, device):
 def test_2in_1out_diff_queue_but_equal_context(func, device):
     x1 = dpnp.arange(10)
     x2 = dpnp.arange(10, sycl_queue=dpctl.SyclQueue(device))[::-1]
-    with assert_raises((ValueError, dpt.ExecutionPlacementError)):
+    with assert_raises((ValueError, ExecutionPlacementError)):
         getattr(dpnp, func)(x1, x2)
 
 

From 56a0af4a6441d2b0809f2ddeb8838f196fb7f09f Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 10 Apr 2026 09:35:15 -0700
Subject: [PATCH 27/43] Apply clang-format

---
 dpnp/backend/include/dpnp4pybind11.hpp        |  20 +---
 .../include/kernels/accumulators.hpp          |  87 ++++++--------
 .../tensor/libtensor/include/kernels/clip.hpp |   3 +-
 .../include/kernels/copy_and_cast.hpp         |  13 +--
 .../include/kernels/copy_as_contiguous.hpp    |  18 +--
 .../kernels/elementwise_functions/abs.hpp     |   6 +-
 .../kernels/elementwise_functions/add.hpp     |  23 ++--
 .../kernels/elementwise_functions/atanh.hpp   |   4 +-
 .../elementwise_functions/bitwise_and.hpp     |   4 +-
 .../bitwise_left_shift.hpp                    |  10 +-
 .../bitwise_right_shift.hpp                   |  20 ++--
 .../elementwise_functions/bitwise_xor.hpp     |   4 +-
 .../kernels/elementwise_functions/cbrt.hpp    |   5 +-
 .../kernels/elementwise_functions/common.hpp  | 101 ++++++++---------
 .../elementwise_functions/common_detail.hpp   |   7 +-
 .../elementwise_functions/common_inplace.hpp  |  34 +++---
 .../kernels/elementwise_functions/equal.hpp   |   6 +-
 .../elementwise_functions/floor_divide.hpp    |  14 +--
 .../kernels/elementwise_functions/greater.hpp |   6 +-
 .../elementwise_functions/greater_equal.hpp   |   6 +-
 .../elementwise_functions/isfinite.hpp        |   3 +-
 .../kernels/elementwise_functions/isinf.hpp   |   3 +-
 .../kernels/elementwise_functions/isnan.hpp   |   3 +-
 .../kernels/elementwise_functions/less.hpp    |   6 +-
 .../elementwise_functions/less_equal.hpp      |   6 +-
 .../kernels/elementwise_functions/maximum.hpp |   9 +-
 .../kernels/elementwise_functions/minimum.hpp |   9 +-
 .../elementwise_functions/multiply.hpp        |  17 +--
 .../elementwise_functions/negative.hpp        |   5 +-
 .../elementwise_functions/not_equal.hpp       |   3 +-
 .../elementwise_functions/positive.hpp        |   5 +-
 .../kernels/elementwise_functions/pow.hpp     |  11 +-
 .../elementwise_functions/remainder.hpp       |  10 +-
 .../kernels/elementwise_functions/rsqrt.hpp   |   5 +-
 .../kernels/elementwise_functions/signbit.hpp |   5 +-
 .../elementwise_functions/subtract.hpp        |  14 +--
 .../elementwise_functions/true_divide.hpp     |  22 ++--
 .../kernels/linalg_functions/dot_product.hpp  |  36 +++---
 .../include/kernels/linalg_functions/gemm.hpp |  54 ++++-----
 .../libtensor/include/kernels/reductions.hpp  | 106 ++++++++----------
 .../include/kernels/sorting/merge_sort.hpp    |  32 ++----
 .../include/kernels/sorting/radix_sort.hpp    |  40 ++-----
 .../include/kernels/sorting/topk.hpp          |  12 +-
 .../libtensor/include/kernels/where.hpp       |   6 +-
 .../libtensor/include/utils/offset_utils.hpp  |  64 +++--------
 .../include/utils/rich_comparisons.hpp        |   6 +-
 .../libtensor/include/utils/strided_iters.hpp |  24 +---
 .../libtensor/include/utils/sycl_utils.hpp    |   5 +-
 .../libtensor/include/utils/type_dispatch.hpp |   4 +-
 .../include/utils/type_dispatch_building.hpp  |  16 +--
 .../libtensor/include/utils/type_utils.hpp    |   3 +-
 .../accumulators/accumulate_over_axis.hpp     |   3 +-
 .../accumulators/cumulative_logsumexp.cpp     |  12 +-
 .../source/accumulators/cumulative_prod.cpp   |  20 ++--
 .../source/accumulators/cumulative_sum.cpp    |  20 ++--
 .../source/boolean_advanced_indexing.cpp      |   9 +-
 dpnp/tensor/libtensor/source/clip.cpp         |   6 +-
 .../source/copy_and_cast_usm_to_usm.cpp       |   3 +-
 .../libtensor/source/copy_as_contig.cpp       |   6 +-
 .../source/device_support_queries.cpp         |  10 +-
 .../elementwise_functions.hpp                 |  27 ++---
 dpnp/tensor/libtensor/source/full_ctor.cpp    |   5 +-
 .../libtensor/source/linalg_functions/dot.cpp |  15 +--
 .../libtensor/source/reductions/argmax.cpp    |  15 +--
 .../libtensor/source/reductions/argmin.cpp    |  15 +--
 .../libtensor/source/reductions/logsumexp.cpp |   9 +-
 .../libtensor/source/reductions/max.cpp       |   9 +-
 .../libtensor/source/reductions/min.cpp       |   9 +-
 .../libtensor/source/reductions/prod.cpp      |  18 +--
 .../source/reductions/reduce_hypot.cpp        |   9 +-
 .../reductions/reduction_atomic_support.hpp   |   8 +-
 .../source/reductions/reduction_over_axis.hpp |  33 ++----
 .../libtensor/source/reductions/sum.cpp       |   9 +-
 dpnp/tensor/libtensor/source/repeat.cpp       |  11 +-
 .../source/simplify_iteration_space.cpp       |   3 +-
 .../source/sorting/merge_argsort.cpp          |   6 +-
 .../source/sorting/py_argsort_common.hpp      |   3 +-
 .../source/sorting/radix_argsort.cpp          |   6 +-
 .../libtensor/source/sorting/searchsorted.cpp |  19 ++--
 dpnp/tensor/libtensor/source/where.cpp        |   7 +-
 80 files changed, 445 insertions(+), 815 deletions(-)

diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp
index ada7b7e380fb..896ff20873a5 100644
--- a/dpnp/backend/include/dpnp4pybind11.hpp
+++ b/dpnp/backend/include/dpnp4pybind11.hpp
@@ -195,22 +195,10 @@ class dpctl_capi
         return api;
     }
 
-    py::object default_sycl_queue_pyobj()
-    {
-        return *default_sycl_queue_;
-    }
-    py::object default_usm_memory_pyobj()
-    {
-        return *default_usm_memory_;
-    }
-    py::object default_usm_ndarray_pyobj()
-    {
-        return *default_usm_ndarray_;
-    }
-    py::object as_usm_memory_pyobj()
-    {
-        return *as_usm_memory_;
-    }
+    py::object default_sycl_queue_pyobj() { return *default_sycl_queue_; }
+    py::object default_usm_memory_pyobj() { return *default_usm_memory_; }
+    py::object default_usm_ndarray_pyobj() { return *default_usm_ndarray_; }
+    py::object as_usm_memory_pyobj() { return *as_usm_memory_; }
 
 private:
     struct Deleter
diff --git a/dpnp/tensor/libtensor/include/kernels/accumulators.hpp b/dpnp/tensor/libtensor/include/kernels/accumulators.hpp
index 60382e210d8b..9449c030ac67 100644
--- a/dpnp/tensor/libtensor/include/kernels/accumulators.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/accumulators.hpp
@@ -85,10 +85,7 @@ struct NoOpTransformer
 {
     constexpr NoOpTransformer() {}
 
-    T operator()(const T &val) const
-    {
-        return val;
-    }
+    T operator()(const T &val) const { return val; }
 };
 
 template <typename srcTy, typename dstTy>
@@ -134,22 +131,13 @@ class stack_t
         : src_(src), size_(sz), local_scans_(local_scans)
     {
     }
-    ~stack_t(){};
+    ~stack_t() {};
 
-    T *get_src_ptr() const
-    {
-        return src_;
-    }
+    T *get_src_ptr() const { return src_; }
 
-    std::size_t get_size() const
-    {
-        return size_;
-    }
+    std::size_t get_size() const { return size_; }
 
-    T *get_local_scans_ptr() const
-    {
-        return local_scans_;
-    }
+    T *get_local_scans_ptr() const { return local_scans_; }
 };
 
 template <typename T>
@@ -170,27 +158,15 @@ class stack_strided_t
           local_stride_(local_stride)
     {
     }
-    ~stack_strided_t(){};
+    ~stack_strided_t() {};
 
-    T *get_src_ptr() const
-    {
-        return src_;
-    }
+    T *get_src_ptr() const { return src_; }
 
-    std::size_t get_size() const
-    {
-        return size_;
-    }
+    std::size_t get_size() const { return size_; }
 
-    T *get_local_scans_ptr() const
-    {
-        return local_scans_;
-    }
+    T *get_local_scans_ptr() const { return local_scans_; }
 
-    std::size_t get_local_stride() const
-    {
-        return local_stride_;
-    }
+    std::size_t get_local_stride() const { return local_stride_; }
 };
 
 } // end of namespace detail
@@ -515,32 +491,35 @@ sycl::event inclusive_scan_base_step_striped(
             it.barrier(sycl::access::fence_space::local_space);
 
             // convert back to blocked layout
-            {{const std::uint32_t local_offset0 = lid * n_wi;
+            {
+                {
+                    const std::uint32_t local_offset0 = lid * n_wi;
 #pragma unroll
-            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
-                slm_iscan_tmp[local_offset0 + m_wi] = local_iscan[m_wi];
-            }
+                    for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
+                        slm_iscan_tmp[local_offset0 + m_wi] = local_iscan[m_wi];
+                    }
 
-            it.barrier(sycl::access::fence_space::local_space);
+                    it.barrier(sycl::access::fence_space::local_space);
                 }
             }
 
             {
-        const std::uint32_t block_offset = sgroup_id * sgSize * n_wi + lane_id;
+                const std::uint32_t block_offset =
+                    sgroup_id * sgSize * n_wi + lane_id;
 #pragma unroll
-        for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
-            const std::uint32_t m_wi_scaled = m_wi * sgSize;
-            const std::size_t out_id = inp_id0 + m_wi_scaled;
-            if (out_id < acc_nelems) {
-                output[out_iter_offset + out_indexer(out_id)] =
-                    slm_iscan_tmp[block_offset + m_wi_scaled];
-            }
-        }
+                for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
+                    const std::uint32_t m_wi_scaled = m_wi * sgSize;
+                    const std::size_t out_id = inp_id0 + m_wi_scaled;
+                    if (out_id < acc_nelems) {
+                        output[out_iter_offset + out_indexer(out_id)] =
+                            slm_iscan_tmp[block_offset + m_wi_scaled];
+                    }
+                }
             }
-});
-});
+        });
+    });
 
-return inc_scan_phase1_ev;
+    return inc_scan_phase1_ev;
 }
 
 template <typename inputT,
@@ -746,8 +725,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
         }
 
         for (std::size_t reverse_stack_id = 0; reverse_stack_id < stack.size();
-             ++reverse_stack_id)
-        {
+             ++reverse_stack_id) {
             const std::size_t stack_id = stack.size() - 1 - reverse_stack_id;
 
             const auto &stack_elem = stack[stack_id];
@@ -1082,8 +1060,7 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
         }
 
         for (std::size_t reverse_stack_id = 0;
-             reverse_stack_id < stack.size() - 1; ++reverse_stack_id)
-        {
+             reverse_stack_id < stack.size() - 1; ++reverse_stack_id) {
             const std::size_t stack_id = stack.size() - 1 - reverse_stack_id;
 
             const auto &stack_elem = stack[stack_id];
diff --git a/dpnp/tensor/libtensor/include/kernels/clip.hpp b/dpnp/tensor/libtensor/include/kernels/clip.hpp
index 58a86a8f82d6..900fcf3df100 100644
--- a/dpnp/tensor/libtensor/include/kernels/clip.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/clip.hpp
@@ -219,8 +219,7 @@ sycl::event clip_contig_impl(sycl::queue &q,
         if (is_aligned<required_alignment>(x_cp) &&
             is_aligned<required_alignment>(min_cp) &&
             is_aligned<required_alignment>(max_cp) &&
-            is_aligned<required_alignment>(dst_cp))
-        {
+            is_aligned<required_alignment>(dst_cp)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
             using Impl =
diff --git a/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp
index d6001a11e471..2c4146d467e6 100644
--- a/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -352,8 +352,7 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q,
         const auto lws_range = sycl::range<1>(lws);
 
         if (is_aligned<required_alignment>(src_cp) &&
-            is_aligned<required_alignment>(dst_cp))
-        {
+            is_aligned<required_alignment>(dst_cp)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName =
                 copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
@@ -920,10 +919,7 @@ struct CompositionIndexer
 {
     CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {}
 
-    auto operator()(std::size_t gid) const
-    {
-        return f_(t_(gid));
-    }
+    auto operator()(std::size_t gid) const { return f_(t_(gid)); }
 
 private:
     IndexerT f_;
@@ -944,10 +940,7 @@ struct RolledNDIndexer
     {
     }
 
-    ssize_t operator()(std::size_t gid) const
-    {
-        return compute_offset(gid);
-    }
+    ssize_t operator()(std::size_t gid) const { return compute_offset(gid); }
 
 private:
     int nd_ = -1;
diff --git a/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
index 37126a22dc64..a723f6334e7e 100644
--- a/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
@@ -261,10 +261,7 @@ typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)(
 template <typename fnT, typename T>
 struct AsCContigFactory
 {
-    fnT get()
-    {
-        return as_c_contiguous_array_generic_impl<T>;
-    }
+    fnT get() { return as_c_contiguous_array_generic_impl<T>; }
 };
 
 template <typename T,
@@ -496,8 +493,7 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl(
             else {
                 // map local_linear_id into (local_dim0, local_dim1)
                 for (std::uint16_t el_id = lid_lin;
-                     el_id < local_dim0 * local_dim1; el_id += lws0 * lws1)
-                {
+                     el_id < local_dim0 * local_dim1; el_id += lws0 * lws1) {
 
                     // 0 <= local_i0 < local_dim0
                     const std::uint16_t loc_i0 = el_id / local_dim1;
@@ -577,10 +573,7 @@ typedef sycl::event (
 template <typename fnT, typename T>
 struct AsCContig1DBatchOfSquareMatricesFactory
 {
-    fnT get()
-    {
-        return as_c_contiguous_1d_batch_of_square_matrices_impl<T>;
-    }
+    fnT get() { return as_c_contiguous_1d_batch_of_square_matrices_impl<T>; }
 };
 
 template <typename T>
@@ -638,9 +631,6 @@ typedef sycl::event (
 template <typename fnT, typename T>
 struct AsCContigNDBatchOfSquareMatricesFactory
 {
-    fnT get()
-    {
-        return as_c_contiguous_nd_batch_of_square_matrices_impl<T>;
-    }
+    fnT get() { return as_c_contiguous_nd_batch_of_square_matrices_impl<T>; }
 };
 } // namespace dpctl::tensor::kernels::copy_as_contig
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
index 1f0b3df33e4e..250ba1d70455 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -73,8 +73,7 @@ struct AbsFunctor
 
         if constexpr (std::is_same_v<argT, bool> ||
                       (std::is_integral<argT>::value &&
-                       std::is_unsigned<argT>::value))
-        {
+                       std::is_unsigned<argT>::value)) {
             static_assert(std::is_same_v<resT, argT>);
             return x;
         }
@@ -83,8 +82,7 @@ struct AbsFunctor
                 return detail::cabs(x);
             }
             else if constexpr (std::is_same_v<argT, sycl::half> ||
-                               std::is_floating_point_v<argT>)
-            {
+                               std::is_floating_point_v<argT>) {
                 return (sycl::signbit(x) ? -x : x);
             }
             else {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index 1b7440304f0e..c7386f99236a 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -71,23 +71,20 @@ struct AddFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value &&
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             using rT1 = typename argT1::value_type;
             using rT2 = typename argT2::value_type;
 
             return exprm_ns::complex<rT1>(in1) + exprm_ns::complex<rT2>(in2);
         }
         else if constexpr (tu_ns::is_complex<argT1>::value &&
-                           !tu_ns::is_complex<argT2>::value)
-        {
+                           !tu_ns::is_complex<argT2>::value) {
             using rT1 = typename argT1::value_type;
 
             return exprm_ns::complex<rT1>(in1) + in2;
         }
         else if constexpr (!tu_ns::is_complex<argT1>::value &&
-                           tu_ns::is_complex<argT2>::value)
-        {
+                           tu_ns::is_complex<argT2>::value) {
             using rT2 = typename argT2::value_type;
 
             return in1 + exprm_ns::complex<rT2>(in2);
@@ -402,8 +399,7 @@ struct AddContigMatrixContigRowBroadcastFactory
             using resT = typename AddOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -449,8 +445,7 @@ struct AddContigRowContigMatrixBroadcastFactory
             using resT = typename AddOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -472,10 +467,7 @@ struct AddInplaceFunctor
     using supports_vec = std::negation<
         std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
 
-    void operator()(resT &res, const argT &in)
-    {
-        res += in;
-    }
+    void operator()(resT &res, const argT &in) { res += in; }
 
     template <int vec_sz>
     void operator()(sycl::vec<resT, vec_sz> &res,
@@ -672,8 +664,7 @@ struct AddInplaceRowMatrixBroadcastFactory
         }
         else {
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<T2>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
index f72380ae3de9..32f5384f4ad8 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
@@ -123,8 +123,8 @@ struct AtanhFunctor
              */
             const realT RECIP_EPSILON =
                 realT(1) / std::numeric_limits<realT>::epsilon();
-            if (sycl::fabs(x) > RECIP_EPSILON || sycl::fabs(y) > RECIP_EPSILON)
-            {
+            if (sycl::fabs(x) > RECIP_EPSILON ||
+                sycl::fabs(y) > RECIP_EPSILON) {
                 const realT pi_half = sycl::atan(realT(1)) * 2;
 
                 const realT res_re = realT(0);
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
index d0b644c2f6bb..dae2e62a76b2 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
@@ -372,8 +372,8 @@ struct BitwiseAndInplaceTypeMapFactory
     /*! @brief get typeid for output type of x &= y */
     std::enable_if_t<std::is_same<fnT, int>::value, int> get()
     {
-        if constexpr (BitwiseAndInplaceTypePairSupport<argT, resT>::is_defined)
-        {
+        if constexpr (BitwiseAndInplaceTypePairSupport<argT,
+                                                       resT>::is_defined) {
             return td_ns::GetTypeid<resT>{}.get();
         }
         else {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
index 549a220fbabc..59279a803ed8 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
@@ -307,10 +307,7 @@ struct BitwiseLeftShiftInplaceFunctor
     using supports_sg_loadstore = typename std::true_type;
     using supports_vec = typename std::true_type;
 
-    void operator()(resT &res, const argT &in) const
-    {
-        impl(res, in);
-    }
+    void operator()(resT &res, const argT &in) const { impl(res, in); }
 
     template <int vec_sz>
     void operator()(sycl::vec<resT, vec_sz> &res,
@@ -392,9 +389,8 @@ struct BitwiseLeftShiftInplaceTypeMapFactory
     /*! @brief get typeid for output type of x <<= y */
     std::enable_if_t<std::is_same<fnT, int>::value, int> get()
     {
-        if constexpr (BitwiseLeftShiftInplaceTypePairSupport<argT,
-                                                             resT>::is_defined)
-        {
+        if constexpr (BitwiseLeftShiftInplaceTypePairSupport<
+                          argT, resT>::is_defined) {
             return td_ns::GetTypeid<resT>{}.get();
         }
         else {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
index 49e05ac43f9a..241852b6a06e 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
@@ -309,10 +309,7 @@ struct BitwiseRightShiftInplaceFunctor
     using supports_sg_loadstore = typename std::true_type;
     using supports_vec = typename std::true_type;
 
-    void operator()(resT &res, const argT &in) const
-    {
-        impl(res, in);
-    }
+    void operator()(resT &res, const argT &in) const { impl(res, in); }
 
     template <int vec_sz>
     void operator()(sycl::vec<resT, vec_sz> &res,
@@ -396,9 +393,8 @@ struct BitwiseRightShiftInplaceTypeMapFactory
     /*! @brief get typeid for output type of x >>= y */
     std::enable_if_t<std::is_same<fnT, int>::value, int> get()
     {
-        if constexpr (BitwiseRightShiftInplaceTypePairSupport<argT,
-                                                              resT>::is_defined)
-        {
+        if constexpr (BitwiseRightShiftInplaceTypePairSupport<
+                          argT, resT>::is_defined) {
             return td_ns::GetTypeid<resT>{}.get();
         }
         else {
@@ -436,9 +432,8 @@ struct BitwiseRightShiftInplaceContigFactory
 {
     fnT get()
     {
-        if constexpr (!BitwiseRightShiftInplaceTypePairSupport<T1,
-                                                               T2>::is_defined)
-        {
+        if constexpr (!BitwiseRightShiftInplaceTypePairSupport<
+                          T1, T2>::is_defined) {
             fnT fn = nullptr;
             return fn;
         }
@@ -477,9 +472,8 @@ struct BitwiseRightShiftInplaceStridedFactory
 {
     fnT get()
     {
-        if constexpr (!BitwiseRightShiftInplaceTypePairSupport<T1,
-                                                               T2>::is_defined)
-        {
+        if constexpr (!BitwiseRightShiftInplaceTypePairSupport<
+                          T1, T2>::is_defined) {
             fnT fn = nullptr;
             return fn;
         }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
index 2238492d50d3..292cf3f76df6 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
@@ -375,8 +375,8 @@ struct BitwiseXorInplaceTypeMapFactory
     /*! @brief get typeid for output type of x ^= y */
     std::enable_if_t<std::is_same<fnT, int>::value, int> get()
     {
-        if constexpr (BitwiseXorInplaceTypePairSupport<argT, resT>::is_defined)
-        {
+        if constexpr (BitwiseXorInplaceTypePairSupport<argT,
+                                                       resT>::is_defined) {
             return td_ns::GetTypeid<resT>{}.get();
         }
         else {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
index 57bbb09523a4..20fb0ea7bcda 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
@@ -67,10 +67,7 @@ struct CbrtFunctor
     // do both argTy and resTy support sugroup store/load operation
     using supports_sg_loadstore = typename std::true_type;
 
-    resT operator()(const argT &in) const
-    {
-        return sycl::cbrt(in);
-    }
+    resT operator()(const argT &in) const { return sycl::cbrt(in); }
 };
 
 template <typename argTy,
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index 1c072dc58fdc..cfe3f4898491 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -88,8 +88,8 @@ struct UnaryContigFunctor
         /* Each work-item processes vec_sz elements, contiguous in memory */
         /* NOTE: work-group size must be divisible by sub-group size */
 
-        if constexpr (enable_sg_loadstore && UnaryOperatorT::is_constant::value)
-        {
+        if constexpr (enable_sg_loadstore &&
+                      UnaryOperatorT::is_constant::value) {
             // value of operator is known to be a known constant
             constexpr resT const_val = UnaryOperatorT::constant_value;
 
@@ -120,8 +120,8 @@ struct UnaryContigFunctor
         }
         else if constexpr (enable_sg_loadstore &&
                            UnaryOperatorT::supports_sg_loadstore::value &&
-                           UnaryOperatorT::supports_vec::value && (vec_sz > 1))
-        {
+                           UnaryOperatorT::supports_vec::value &&
+                           (vec_sz > 1)) {
             auto sg = ndit.get_sub_group();
             const std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -155,8 +155,7 @@ struct UnaryContigFunctor
         }
         else if constexpr (enable_sg_loadstore &&
                            UnaryOperatorT::supports_sg_loadstore::value &&
-                           std::is_same_v<resT, argT>)
-        {
+                           std::is_same_v<resT, argT>) {
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
@@ -193,8 +192,7 @@ struct UnaryContigFunctor
             }
         }
         else if constexpr (enable_sg_loadstore &&
-                           UnaryOperatorT::supports_sg_loadstore::value)
-        {
+                           UnaryOperatorT::supports_sg_loadstore::value) {
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
@@ -290,16 +288,16 @@ SizeT select_lws(const sycl::device &, SizeT n_work_items_needed)
 }
 
 template <typename argTy,
-          template <typename T>
-          class UnaryOutputType,
+          template <typename T> class UnaryOutputType,
           template <typename A,
                     typename R,
                     std::uint8_t vs,
                     std::uint8_t nv,
-                    bool enable>
-          class ContigFunctorT,
-          template <typename A, typename R, std::uint8_t vs, std::uint8_t nv>
-          class kernel_name,
+                    bool enable> class ContigFunctorT,
+          template <typename A,
+                    typename R,
+                    std::uint8_t vs,
+                    std::uint8_t nv> class kernel_name,
           std::uint8_t vec_sz = 4u,
           std::uint8_t n_vecs = 2u>
 sycl::event unary_contig_impl(sycl::queue &exec_q,
@@ -328,8 +326,7 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
         cgh.depends_on(depends);
 
         if (is_aligned<required_alignment>(arg_p) &&
-            is_aligned<required_alignment>(res_p))
-        {
+            is_aligned<required_alignment>(res_p)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName = BaseKernelName;
             using Impl = ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
@@ -356,12 +353,9 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
 }
 
 template <typename argTy,
-          template <typename T>
-          class UnaryOutputType,
-          template <typename A, typename R, typename I>
-          class StridedFunctorT,
-          template <typename A, typename R, typename I>
-          class kernel_name>
+          template <typename T> class UnaryOutputType,
+          template <typename A, typename R, typename I> class StridedFunctorT,
+          template <typename A, typename R, typename I> class kernel_name>
 sycl::event
     unary_strided_impl(sycl::queue &exec_q,
                        std::size_t nelems,
@@ -428,8 +422,7 @@ struct BinaryContigFunctor
 
         if constexpr (enable_sg_loadstore &&
                       BinaryOperatorT::supports_sg_loadstore::value &&
-                      BinaryOperatorT::supports_vec::value && (vec_sz > 1))
-        {
+                      BinaryOperatorT::supports_vec::value && (vec_sz > 1)) {
             auto sg = ndit.get_sub_group();
             std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -469,8 +462,7 @@ struct BinaryContigFunctor
             }
         }
         else if constexpr (enable_sg_loadstore &&
-                           BinaryOperatorT::supports_sg_loadstore::value)
-        {
+                           BinaryOperatorT::supports_sg_loadstore::value) {
             auto sg = ndit.get_sub_group();
             const std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -771,21 +763,18 @@ typedef sycl::event (*binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t)(
 
 template <typename argTy1,
           typename argTy2,
-          template <typename T1, typename T2>
-          class BinaryOutputType,
+          template <typename T1, typename T2> class BinaryOutputType,
           template <typename T1,
                     typename T2,
                     typename T3,
                     std::uint8_t vs,
                     std::uint8_t nv,
-                    bool enable_sg_loadstore>
-          class BinaryContigFunctorT,
+                    bool enable_sg_loadstore> class BinaryContigFunctorT,
           template <typename T1,
                     typename T2,
                     typename T3,
                     std::uint8_t vs,
-                    std::uint8_t nv>
-          class kernel_name,
+                    std::uint8_t nv> class kernel_name,
           std::uint8_t vec_sz = 4u,
           std::uint8_t n_vecs = 2u>
 sycl::event binary_contig_impl(sycl::queue &exec_q,
@@ -821,8 +810,7 @@ sycl::event binary_contig_impl(sycl::queue &exec_q,
 
         if (is_aligned<required_alignment>(arg1_tp) &&
             is_aligned<required_alignment>(arg2_tp) &&
-            is_aligned<required_alignment>(res_tp))
-        {
+            is_aligned<required_alignment>(res_tp)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName = BaseKernelName;
             using Impl = BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz,
@@ -849,12 +837,15 @@ sycl::event binary_contig_impl(sycl::queue &exec_q,
 
 template <typename argTy1,
           typename argTy2,
-          template <typename T1, typename T2>
-          class BinaryOutputType,
-          template <typename T1, typename T2, typename T3, typename IndT>
-          class BinaryStridedFunctorT,
-          template <typename T1, typename T2, typename T3, typename IndT>
-          class kernel_name>
+          template <typename T1, typename T2> class BinaryOutputType,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename IndT> class BinaryStridedFunctorT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename IndT> class kernel_name>
 sycl::event
     binary_strided_impl(sycl::queue &exec_q,
                         std::size_t nelems,
@@ -893,13 +884,14 @@ sycl::event
     return comp_ev;
 }
 
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          template <typename T1, typename T2, typename T3>
-          class BinaryContigMatrixContigRowBroadcastFunctorT,
-          template <typename T1, typename T2, typename T3>
-          class kernel_name>
+template <
+    typename argT1,
+    typename argT2,
+    typename resT,
+    template <typename T1,
+              typename T2,
+              typename T3> class BinaryContigMatrixContigRowBroadcastFunctorT,
+    template <typename T1, typename T2, typename T3> class kernel_name>
 sycl::event binary_contig_matrix_contig_row_broadcast_impl(
     sycl::queue &exec_q,
     std::vector<sycl::event> &host_tasks,
@@ -967,13 +959,14 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl(
     return comp_ev;
 }
 
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          template <typename T1, typename T2, typename T3>
-          class BinaryContigRowContigMatrixBroadcastFunctorT,
-          template <typename T1, typename T2, typename T3>
-          class kernel_name>
+template <
+    typename argT1,
+    typename argT2,
+    typename resT,
+    template <typename T1,
+              typename T2,
+              typename T3> class BinaryContigRowContigMatrixBroadcastFunctorT,
+    template <typename T1, typename T2, typename T3> class kernel_name>
 sycl::event binary_contig_row_contig_matrix_broadcast_impl(
     sycl::queue &exec_q,
     std::vector<sycl::event> &host_tasks,
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
index b304b5ac3a39..68d025ec6307 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
@@ -58,10 +58,9 @@ sycl::event
         sycl::range<1> gRange{padded_vec_sz};
 
         cgh.parallel_for<class populate_padded_vec_krn<T>>(
-            gRange, [=](sycl::id<1> id)
-        {
-            std::size_t i = id[0];
-            padded_vec[i] = vec[i % vec_sz];
+            gRange, [=](sycl::id<1> id) {
+                std::size_t i = id[0];
+                padded_vec[i] = vec[i % vec_sz];
             });
     });
 
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
index 2c028bc30155..61902fce888a 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -92,8 +92,7 @@ struct BinaryInplaceContigFunctor
         if constexpr (enable_sg_loadstore &&
                       BinaryInplaceOperatorT::supports_sg_loadstore::value &&
                       BinaryInplaceOperatorT::supports_vec::value &&
-                      (vec_sz > 1))
-        {
+                      (vec_sz > 1)) {
             auto sg = ndit.get_sub_group();
             std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -130,8 +129,8 @@ struct BinaryInplaceContigFunctor
             }
         }
         else if constexpr (enable_sg_loadstore &&
-                           BinaryInplaceOperatorT::supports_sg_loadstore::value)
-        {
+                           BinaryInplaceOperatorT::supports_sg_loadstore::
+                               value) {
             auto sg = ndit.get_sub_group();
             std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -312,10 +311,11 @@ template <typename argTy,
                     typename T2,
                     std::uint8_t vs,
                     std::uint8_t nv,
-                    bool enable_sg_loadstore>
-          class BinaryInplaceContigFunctorT,
-          template <typename T1, typename T2, std::uint8_t vs, std::uint8_t nv>
-          class kernel_name,
+                    bool enable_sg_loadstore> class BinaryInplaceContigFunctorT,
+          template <typename T1,
+                    typename T2,
+                    std::uint8_t vs,
+                    std::uint8_t nv> class kernel_name,
           std::uint8_t vec_sz = 4u,
           std::uint8_t n_vecs = 2u>
 sycl::event
@@ -341,8 +341,7 @@ sycl::event
         resTy *res_tp = reinterpret_cast<resTy *>(lhs_p) + lhs_offset;
 
         if (is_aligned<required_alignment>(arg_tp) &&
-            is_aligned<required_alignment>(res_tp))
-        {
+            is_aligned<required_alignment>(res_tp)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
             using Impl =
@@ -372,10 +371,10 @@ sycl::event
 
 template <typename argTy,
           typename resTy,
-          template <typename T1, typename T2, typename IndT>
-          class BinaryInplaceStridedFunctorT,
-          template <typename T1, typename T2, typename IndT>
-          class kernel_name>
+          template <typename T1,
+                    typename T2,
+                    typename IndT> class BinaryInplaceStridedFunctorT,
+          template <typename T1, typename T2, typename IndT> class kernel_name>
 sycl::event binary_inplace_strided_impl(
     sycl::queue &exec_q,
     std::size_t nelems,
@@ -410,10 +409,9 @@ sycl::event binary_inplace_strided_impl(
 
 template <typename argT,
           typename resT,
-          template <typename T1, typename T3>
-          class BinaryInplaceRowMatrixBroadcastFunctorT,
-          template <typename T1, typename T3>
-          class kernel_name>
+          template <typename T1,
+                    typename T3> class BinaryInplaceRowMatrixBroadcastFunctorT,
+          template <typename T1, typename T3> class kernel_name>
 sycl::event binary_inplace_row_matrix_broadcast_impl(
     sycl::queue &exec_q,
     std::vector<sycl::event> &host_tasks,
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
index 3a838e919369..07b3566c5cef 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
@@ -73,8 +73,7 @@ struct EqualFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value &&
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             using realT1 = typename argT1::value_type;
             using realT2 = typename argT2::value_type;
 
@@ -84,8 +83,7 @@ struct EqualFunctor
         else {
             if constexpr (std::is_integral_v<argT1> &&
                           std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>) {
                 if constexpr (std::is_signed_v<argT1> &&
                               !std::is_signed_v<argT2>) {
                     return (in1 < 0) ? false : (static_cast<argT2>(in1) == in2);
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
index 19ee9d268770..e669a97c04ea 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -128,10 +128,7 @@ struct FloorDivideFunctor
     }
 
 private:
-    bool l_xor(bool b1, bool b2) const
-    {
-        return (b1 != b2);
-    }
+    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
 };
 
 template <typename argT1,
@@ -401,10 +398,7 @@ struct FloorDivideInplaceFunctor
     }
 
 private:
-    bool l_xor(bool b1, bool b2) const
-    {
-        return (b1 != b2);
-    }
+    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
 };
 
 template <typename argT,
@@ -462,8 +456,8 @@ struct FloorDivideInplaceTypeMapFactory
     /*! @brief get typeid for output type of x //= y */
     std::enable_if_t<std::is_same<fnT, int>::value, int> get()
     {
-        if constexpr (FloorDivideInplaceTypePairSupport<argT, resT>::is_defined)
-        {
+        if constexpr (FloorDivideInplaceTypePairSupport<argT,
+                                                        resT>::is_defined) {
             return td_ns::GetTypeid<resT>{}.get();
         }
         else {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
index 3e38b5f4deca..9b3659faa161 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
@@ -73,8 +73,7 @@ struct GreaterFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
             using dpctl::tensor::math_utils::greater_complex;
             return greater_complex<argT1>(in1, in2);
@@ -82,8 +81,7 @@ struct GreaterFunctor
         else {
             if constexpr (std::is_integral_v<argT1> &&
                           std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>) {
                 if constexpr (std::is_signed_v<argT1> &&
                               !std::is_signed_v<argT2>) {
                     return (in1 < 0) ? false : (static_cast<argT2>(in1) > in2);
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
index 029741b02600..25c56d4d40a4 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
@@ -73,8 +73,7 @@ struct GreaterEqualFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
             using dpctl::tensor::math_utils::greater_equal_complex;
             return greater_equal_complex<argT1>(in1, in2);
@@ -82,8 +81,7 @@ struct GreaterEqualFunctor
         else {
             if constexpr (std::is_integral_v<argT1> &&
                           std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>) {
                 if constexpr (std::is_signed_v<argT1> &&
                               !std::is_signed_v<argT2>) {
                     return (in1 < 0) ? false : (static_cast<argT2>(in1) >= in2);
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
index 5b8ee877981f..8eb435c089d8 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
@@ -84,8 +84,7 @@ struct IsFiniteFunctor
             return (real_isfinite && imag_isfinite);
         }
         else if constexpr (std::is_same<argT, bool>::value ||
-                           std::is_integral<argT>::value)
-        {
+                           std::is_integral<argT>::value) {
             return constant_value;
         }
         else if constexpr (std::is_same_v<argT, sycl::half>) {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
index 89ba83df9268..b7d85e21a1f2 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
@@ -82,8 +82,7 @@ struct IsInfFunctor
             return (real_isinf || imag_isinf);
         }
         else if constexpr (std::is_same<argT, bool>::value ||
-                           std::is_integral<argT>::value)
-        {
+                           std::is_integral<argT>::value) {
             return constant_value;
         }
         else if constexpr (std::is_same_v<argT, sycl::half>) {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
index f78b724bf2d3..cad2d2239de0 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
@@ -83,8 +83,7 @@ struct IsNanFunctor
             return (real_isnan || imag_isnan);
         }
         else if constexpr (std::is_same<argT, bool>::value ||
-                           std::is_integral<argT>::value)
-        {
+                           std::is_integral<argT>::value) {
             return constant_value;
         }
         else {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
index 7f1c68c5c65c..19077936372e 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
@@ -73,8 +73,7 @@ struct LessFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
             using dpctl::tensor::math_utils::less_complex;
             return less_complex<argT1>(in1, in2);
@@ -82,8 +81,7 @@ struct LessFunctor
         else {
             if constexpr (std::is_integral_v<argT1> &&
                           std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>) {
                 if constexpr (std::is_signed_v<argT1> &&
                               !std::is_signed_v<argT2>) {
                     return (in1 < 0) ? true : (static_cast<argT2>(in1) < in2);
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
index a8c58ee31277..a0b23693e70d 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
@@ -73,8 +73,7 @@ struct LessEqualFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
             using dpctl::tensor::math_utils::less_equal_complex;
             return less_equal_complex<argT1>(in1, in2);
@@ -82,8 +81,7 @@ struct LessEqualFunctor
         else {
             if constexpr (std::is_integral_v<argT1> &&
                           std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>) {
                 if constexpr (std::is_signed_v<argT1> &&
                               !std::is_signed_v<argT2>) {
                     return (in1 < 0) ? true : (static_cast<argT2>(in1) <= in2);
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index af6f95863e65..52494cceba93 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -73,15 +73,13 @@ struct MaximumFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
             using dpctl::tensor::math_utils::max_complex;
             return max_complex<argT1>(in1, in2);
         }
         else if constexpr (std::is_floating_point_v<argT1> ||
-                           std::is_same_v<argT1, sycl::half>)
-        {
+                           std::is_same_v<argT1, sycl::half>) {
             const bool choose_first = (sycl::isnan(in1) || (in1 > in2));
             return (choose_first) ? in1 : in2;
         }
@@ -101,8 +99,7 @@ struct MaximumFunctor
             const auto &v1 = in1[i];
             const auto &v2 = in2[i];
             if constexpr (std::is_floating_point_v<argT1> ||
-                          std::is_same_v<argT1, sycl::half>)
-            {
+                          std::is_same_v<argT1, sycl::half>) {
                 const bool choose_first = (sycl::isnan(v1) || (v1 > v2));
                 res[i] = (choose_first) ? v1 : v2;
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index 0a95987449a1..c11961f8c5c0 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -73,15 +73,13 @@ struct MinimumFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
             using dpctl::tensor::math_utils::min_complex;
             return min_complex<argT1>(in1, in2);
         }
         else if constexpr (std::is_floating_point_v<argT1> ||
-                           std::is_same_v<argT1, sycl::half>)
-        {
+                           std::is_same_v<argT1, sycl::half>) {
             const bool choose_first = sycl::isnan(in1) || (in1 < in2);
             return (choose_first) ? in1 : in2;
         }
@@ -101,8 +99,7 @@ struct MinimumFunctor
             const auto &v1 = in1[i];
             const auto &v2 = in2[i];
             if constexpr (std::is_floating_point_v<argT1> ||
-                          std::is_same_v<argT1, sycl::half>)
-            {
+                          std::is_same_v<argT1, sycl::half>) {
                 const bool choose_first = sycl::isnan(v1) || (v1 < v2);
                 res[i] = (choose_first) ? v1 : v2;
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
index 587a05106ead..58ff88b3afeb 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
@@ -72,8 +72,7 @@ struct MultiplyFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value &&
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             using realT1 = typename argT1::value_type;
             using realT2 = typename argT2::value_type;
 
@@ -358,8 +357,7 @@ struct MultiplyContigMatrixContigRowBroadcastFactory
             using resT = typename MultiplyOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -406,8 +404,7 @@ struct MultiplyContigRowContigMatrixBroadcastFactory
             using resT = typename MultiplyOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -430,10 +427,7 @@ struct MultiplyInplaceFunctor
     using supports_vec = std::negation<
         std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
 
-    void operator()(resT &res, const argT &in)
-    {
-        res *= in;
-    }
+    void operator()(resT &res, const argT &in) { res *= in; }
 
     template <int vec_sz>
     void operator()(sycl::vec<resT, vec_sz> &res,
@@ -632,8 +626,7 @@ struct MultiplyInplaceRowMatrixBroadcastFactory
         }
         else {
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<T2>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
index f90786013557..e0ac856a3818 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
@@ -68,10 +68,7 @@ struct NegativeFunctor
     using supports_sg_loadstore = typename std::negation<
         std::disjunction<is_complex<resT>, is_complex<argT>>>;
 
-    resT operator()(const argT &x) const
-    {
-        return -x;
-    }
+    resT operator()(const argT &x) const { return -x; }
 };
 
 template <typename argT,
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
index 224e3fbe5b77..007f374b6386 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
@@ -73,8 +73,7 @@ struct NotEqualFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (std::is_integral_v<argT1> && std::is_integral_v<argT2> &&
-                      std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-        {
+                      std::is_signed_v<argT1> != std::is_signed_v<argT2>) {
             if constexpr (std::is_signed_v<argT1> && !std::is_signed_v<argT2>) {
                 return (in1 < 0) ? true : (static_cast<argT2>(in1) != in2);
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
index c1ef29c709ab..fb351b6e50d2 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
@@ -70,10 +70,7 @@ struct PositiveFunctor
     using supports_sg_loadstore = typename std::negation<
         std::disjunction<is_complex<resT>, is_complex<argT>>>;
 
-    resT operator()(const argT &x) const
-    {
-        return x;
-    }
+    resT operator()(const argT &x) const { return x; }
 
     template <int vec_sz>
     sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
index 46489f45985e..1c669ec894d2 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
@@ -94,8 +94,7 @@ struct PowFunctor
             return res;
         }
         else if constexpr (tu_ns::is_complex<argT1>::value &&
-                           tu_ns::is_complex<argT2>::value)
-        {
+                           tu_ns::is_complex<argT2>::value) {
             using realT1 = typename argT1::value_type;
             using realT2 = typename argT2::value_type;
 
@@ -143,9 +142,8 @@ struct PowFunctor
         }
         else {
             auto res = sycl::pow(in1, in2);
-            if constexpr (std::is_same_v<resT,
-                                         typename decltype(res)::element_type>)
-            {
+            if constexpr (std::is_same_v<
+                              resT, typename decltype(res)::element_type>) {
                 return res;
             }
             else {
@@ -400,8 +398,7 @@ struct PowInplaceFunctor
             res = res_tmp;
         }
         else if constexpr (tu_ns::is_complex<argT>::value &&
-                           tu_ns::is_complex<resT>::value)
-        {
+                           tu_ns::is_complex<resT>::value) {
             using r_resT = typename resT::value_type;
             using r_argT = typename argT::value_type;
 
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
index ca87d0f41605..65cd97dbe56d 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
@@ -146,10 +146,7 @@ struct RemainderFunctor
     }
 
 private:
-    bool l_xor(bool b1, bool b2) const
-    {
-        return (b1 != b2);
-    }
+    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
 };
 
 template <typename argT1,
@@ -429,10 +426,7 @@ struct RemainderInplaceFunctor
     }
 
 private:
-    bool l_xor(bool b1, bool b2) const
-    {
-        return (b1 != b2);
-    }
+    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
 };
 
 template <typename argT,
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
index 0228aecdca67..aa4f1113d839 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -67,10 +67,7 @@ struct RsqrtFunctor
     // do both argTy and resTy support sugroup store/load operation
     using supports_sg_loadstore = typename std::true_type;
 
-    resT operator()(const argT &in) const
-    {
-        return sycl::rsqrt(in);
-    }
+    resT operator()(const argT &in) const { return sycl::rsqrt(in); }
 };
 
 template <typename argTy,
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
index d67120633efd..65e9e5a202a9 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
@@ -69,10 +69,7 @@ struct SignbitFunctor
     using supports_vec = std::true_type;
     using supports_sg_loadstore = std::true_type;
 
-    resT operator()(const argT &in) const
-    {
-        return std::signbit(in);
-    }
+    resT operator()(const argT &in) const { return std::signbit(in); }
 
     template <int vec_sz>
     sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
index dfd9ac72b860..431596594ad3 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
@@ -355,8 +355,7 @@ struct SubtractContigMatrixContigRowBroadcastFactory
             using resT = typename SubtractOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -405,8 +404,7 @@ struct SubtractContigRowContigMatrixBroadcastFactory
             using resT = typename SubtractOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -429,10 +427,7 @@ struct SubtractInplaceFunctor
     using supports_vec = std::negation<
         std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
 
-    void operator()(resT &res, const argT &in)
-    {
-        res -= in;
-    }
+    void operator()(resT &res, const argT &in) { res -= in; }
 
     template <int vec_sz>
     void operator()(sycl::vec<resT, vec_sz> &res,
@@ -630,8 +625,7 @@ struct SubtractInplaceRowMatrixBroadcastFactory
         }
         else {
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<T2>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
index 1372663b96c5..caa1cd2029c4 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
@@ -71,8 +71,7 @@ struct TrueDivideFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value &&
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             using realT1 = typename argT1::value_type;
             using realT2 = typename argT2::value_type;
 
@@ -80,15 +79,13 @@ struct TrueDivideFunctor
                    exprm_ns::complex<realT2>(in2);
         }
         else if constexpr (tu_ns::is_complex<argT1>::value &&
-                           !tu_ns::is_complex<argT2>::value)
-        {
+                           !tu_ns::is_complex<argT2>::value) {
             using realT1 = typename argT1::value_type;
 
             return exprm_ns::complex<realT1>(in1) / in2;
         }
         else if constexpr (!tu_ns::is_complex<argT1>::value &&
-                           tu_ns::is_complex<argT2>::value)
-        {
+                           tu_ns::is_complex<argT2>::value) {
             using realT2 = typename argT2::value_type;
 
             return in1 / exprm_ns::complex<realT2>(in2);
@@ -362,8 +359,7 @@ struct TrueDivideContigMatrixContigRowBroadcastFactory
             using resT = typename TrueDivideOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -412,8 +408,7 @@ struct TrueDivideContigRowContigMatrixBroadcastFactory
             using resT = typename TrueDivideOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -498,8 +493,8 @@ struct TrueDivideInplaceTypeMapFactory
     /*! @brief get typeid for output type of divide(T1 x, T2 y) */
     std::enable_if_t<std::is_same<fnT, int>::value, int> get()
     {
-        if constexpr (TrueDivideInplaceTypePairSupport<argT, resT>::is_defined)
-        {
+        if constexpr (TrueDivideInplaceTypePairSupport<argT,
+                                                       resT>::is_defined) {
             return td_ns::GetTypeid<resT>{}.get();
         }
         else {
@@ -652,8 +647,7 @@ struct TrueDivideInplaceRowMatrixBroadcastFactory
         }
         else {
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<T2>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
index b7f996bfa797..b987ff2988be 100644
--- a/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
@@ -172,8 +172,7 @@ struct DotProductFunctor
             reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
 
         for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) {
             auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
             const auto &lhs_reduction_offset =
                 reduction_offsets_.get_first_offset();
@@ -270,8 +269,7 @@ struct DotProductCustomFunctor
             reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
 
         for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) {
             auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
             const auto &lhs_reduction_offset =
                 reduction_offsets_.get_first_offset();
@@ -301,14 +299,16 @@ struct DotProductCustomFunctor
     }
 };
 
-template <
-    typename lhsTy,
-    typename rhsTy,
-    typename resTy,
-    typename BatchIndexerT,
-    typename RedIndexerT,
-    template <typename T1, typename T2, typename T3, typename T4, typename T5>
-    class kernel_name_token>
+template <typename lhsTy,
+          typename rhsTy,
+          typename resTy,
+          typename BatchIndexerT,
+          typename RedIndexerT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename T4,
+                    typename T5> class kernel_name_token>
 sycl::event sequential_dot_product(sycl::queue &exec_q,
                                    const lhsTy *lhs,
                                    const rhsTy *rhs,
@@ -345,8 +345,7 @@ template <typename lhsTy,
                     typename T3,
                     typename T4,
                     typename T5,
-                    typename T6>
-          class kernel_name_token>
+                    typename T6> class kernel_name_token>
 sycl::event submit_atomic_dot_product(sycl::queue &exec_q,
                                       const lhsTy *lhs,
                                       const rhsTy *rhs,
@@ -716,8 +715,7 @@ struct DotProductNoAtomicFunctor
             reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
 
         for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) {
             auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
             const auto &lhs_reduction_offset =
                 reduction_offsets_.get_first_offset();
@@ -817,8 +815,7 @@ struct DotProductNoAtomicCustomFunctor
             reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
 
         for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) {
             auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
             const auto &lhs_reduction_offset =
                 reduction_offsets_.get_first_offset();
@@ -858,8 +855,7 @@ template <typename lhsTy,
                     typename T3,
                     typename T4,
                     typename T5,
-                    typename T6>
-          class kernel_name_token>
+                    typename T6> class kernel_name_token>
 sycl::event
     submit_no_atomic_dot_product(sycl::queue &exec_q,
                                  const lhsTy *lhs,
diff --git a/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
index 8f84d950c0cd..5644ea172a1d 100644
--- a/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
@@ -72,8 +72,7 @@ void scale_gemm_k_parameters(const std::size_t &local_mem_size,
     static constexpr std::size_t slm_elem_size = sizeof(T) * m_groups;
 
     while (slm_elem_size * (n_wi + delta_n) * delta_k + reserved_slm_size >=
-           local_mem_size)
-    {
+           local_mem_size) {
         n_wi = n_wi / 2;
         delta_n = delta_n / 2;
         if (delta_n == 0)
@@ -95,8 +94,7 @@ void scale_gemm_nm_parameters(const std::size_t &local_mem_size,
     while ((wi_delta_n * wg_delta_n * wi_delta_k * slm_A_elem_size) +
                (wi_delta_k * wg_delta_m * slm_B_elem_size) +
                reserved_slm_size >=
-           local_mem_size)
-    {
+           local_mem_size) {
         wg_delta_n /= 2;
         wg_delta_m /= 2;
         wi_delta_k /= 2;
@@ -641,8 +639,8 @@ class GemmBatchFunctorThreadK
                 else {
                     accV_t local_B_vec;
 #pragma unroll
-                    for (std::size_t vec_idx = 0; vec_idx < m_groups; ++vec_idx)
-                    {
+                    for (std::size_t vec_idx = 0; vec_idx < m_groups;
+                         ++vec_idx) {
                         local_B_vec[vec_idx] =
                             (sq < k && j + vec_idx < m)
                                 ? static_cast<resT>(
@@ -1006,8 +1004,7 @@ class GemmBatchFunctorThreadNM_vecm
             // populate local_lhs_block<resT> ( wg_delta_n * wi_delta_n,
             // wi_delta_k)
             for (std::uint32_t vid = lid; vid < local_lhs_block.size();
-                 vid += it.get_local_range()[0])
-            {
+                 vid += it.get_local_range()[0]) {
                 // 0 <= v_i < wg_delta_n * wi_delta_n
                 const std::uint32_t v_i = vid / wi_delta_k;
                 // 0 <= v_s < wi_delta_k
@@ -1029,8 +1026,7 @@ class GemmBatchFunctorThreadNM_vecm
             // populate local_rhs_block<vec<resT, m_vec_size>> ( wg_delta_m *
             // wi_delta_m_vecs, wi_delta_k )
             for (std::uint32_t vid = lid; vid < local_rhs_block.size();
-                 vid += it.get_local_range()[0])
-            {
+                 vid += it.get_local_range()[0]) {
                 // 0 <= v_j < wg_delta_m * wi_delta_m_vecs
                 const std::uint32_t v_j = vid / wi_delta_k;
                 // 0 <= v_s < wi_delta_k
@@ -1091,8 +1087,8 @@ class GemmBatchFunctorThreadNM_vecm
 #pragma unroll
                 for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) {
 #pragma unroll
-                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j)
-                    {
+                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs;
+                         ++pr_j) {
                         private_C[pr_i * wi_delta_m_vecs + pr_j] +=
                             pr_lhs[pr_i] * pr_rhs[pr_j];
                     }
@@ -1108,8 +1104,8 @@ class GemmBatchFunctorThreadNM_vecm
                 std::size_t out_i = i + local_i + pr_i * wg_delta_n;
                 if (out_i < n) {
 #pragma unroll
-                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j)
-                    {
+                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs;
+                         ++pr_j) {
                         const std::size_t out_j =
                             j + (local_j + pr_j * wg_delta_m) * m_vec_size;
                         const std::size_t out_flat_id =
@@ -1128,8 +1124,8 @@ class GemmBatchFunctorThreadNM_vecm
                 std::size_t out_i = i + local_i + pr_i * wg_delta_n;
                 if (out_i < n) {
                     // could be unrolled
-                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j)
-                    {
+                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs;
+                         ++pr_j) {
                         std::size_t out_j =
                             j + (local_j + pr_j * wg_delta_m) * m_vec_size;
 #pragma unroll
@@ -1168,18 +1164,12 @@ struct GemmBatchFunctorThreadNM_vecm_HyperParameters
     {
     }
 
-    constexpr std::uint32_t get_wi_delta_n() const
-    {
-        return wi_delta_n;
-    }
+    constexpr std::uint32_t get_wi_delta_n() const { return wi_delta_n; }
     constexpr std::uint32_t get_wi_delta_m_vecs() const
     {
         return wi_delta_m_vecs;
     }
-    constexpr std::uint32_t get_m_vec_size() const
-    {
-        return m_vec_size;
-    }
+    constexpr std::uint32_t get_m_vec_size() const { return m_vec_size; }
 };
 
 template <typename resT>
@@ -1937,8 +1927,8 @@ class GemmBatchNoAtomicFunctorThreadNM
             else {
                 slmB_t vec{};
 #pragma unroll
-                for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; ++lane_id)
-                {
+                for (std::uint8_t lane_id = 0; lane_id < wi_delta_m;
+                     ++lane_id) {
                     std::size_t g_j1 = g_j + lane_id;
                     vec[lane_id] =
                         (g_j1 < m && g_s < k)
@@ -1966,8 +1956,8 @@ class GemmBatchNoAtomicFunctorThreadNM
             const std::size_t a_pr_offset = private_i * wi_delta_k;
 
             slmB_t local_sum(identity_);
-            for (std::size_t private_s = 0; private_s < wi_delta_k; ++private_s)
-            {
+            for (std::size_t private_s = 0; private_s < wi_delta_k;
+                 ++private_s) {
                 local_sum = local_sum +
                             (local_A_block[a_offset + a_pr_offset + private_s] *
                              local_B_block[b_offset + private_s]);
@@ -1984,8 +1974,8 @@ class GemmBatchNoAtomicFunctorThreadNM
             }
             else {
 #pragma unroll
-                for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; ++lane_id)
-                {
+                for (std::uint8_t lane_id = 0; lane_id < wi_delta_m;
+                     ++lane_id) {
                     const std::size_t gl_j = j + lane_id;
 
                     if (gl_i < n && gl_j < m) {
@@ -2111,8 +2101,8 @@ class GemmBatchNoAtomicFunctorThreadK
                 else {
                     accV_t local_B_vec;
 #pragma unroll
-                    for (std::size_t vec_idx = 0; vec_idx < m_groups; ++vec_idx)
-                    {
+                    for (std::size_t vec_idx = 0; vec_idx < m_groups;
+                         ++vec_idx) {
                         local_B_vec[vec_idx] =
                             (sq < k && j + vec_idx < m)
                                 ? static_cast<resT>(
diff --git a/dpnp/tensor/libtensor/include/kernels/reductions.hpp b/dpnp/tensor/libtensor/include/kernels/reductions.hpp
index ee6431dec637..75df2c201968 100644
--- a/dpnp/tensor/libtensor/include/kernels/reductions.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/reductions.hpp
@@ -138,8 +138,7 @@ struct SequentialReduction
             using dpctl::tensor::type_utils::convert_impl;
             outT val;
             if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
-                          su_ns::IsLogicalOr<outT, ReductionOp>::value)
-            {
+                          su_ns::IsLogicalOr<outT, ReductionOp>::value) {
                 val = convert_impl<bool, argT>(inp_[inp_offset]);
             }
             else {
@@ -221,8 +220,7 @@ struct ReductionOverGroupWithAtomicFunctor
             reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
 
         for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) {
             auto inp_reduction_offset =
                 inp_reduced_dims_indexer_(arg_reduce_gid);
             auto inp_offset = inp_iter_offset + inp_reduction_offset;
@@ -230,8 +228,7 @@ struct ReductionOverGroupWithAtomicFunctor
             using dpctl::tensor::type_utils::convert_impl;
             outT val;
             if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
-                          su_ns::IsLogicalOr<outT, ReductionOp>::value)
-            {
+                          su_ns::IsLogicalOr<outT, ReductionOp>::value) {
                 // handle nans
                 val = convert_impl<bool, argT>(inp_[inp_offset]);
             }
@@ -356,8 +353,7 @@ struct CustomReductionOverGroupWithAtomicFunctor
             reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
 
         for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) {
             auto inp_reduction_offset =
                 inp_reduced_dims_indexer_(arg_reduce_gid);
             auto inp_offset = inp_iter_offset + inp_reduction_offset;
@@ -365,8 +361,7 @@ struct CustomReductionOverGroupWithAtomicFunctor
             using dpctl::tensor::type_utils::convert_impl;
             outT val;
             if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
-                          su_ns::IsLogicalOr<outT, ReductionOp>::value)
-            {
+                          su_ns::IsLogicalOr<outT, ReductionOp>::value) {
                 // handle nans
                 val = convert_impl<bool, argT>(inp_[inp_offset]);
             }
@@ -401,8 +396,8 @@ struct CustomReductionOverGroupWithAtomicFunctor
                                                        ReductionOp>::value) {
                 res_ref.fetch_and(red_val_over_wg);
             }
-            else if constexpr (su_ns::IsSyclLogicalOr<outT, ReductionOp>::value)
-            {
+            else if constexpr (su_ns::IsSyclLogicalOr<outT,
+                                                      ReductionOp>::value) {
                 res_ref.fetch_or(red_val_over_wg);
             }
             else {
@@ -487,8 +482,7 @@ struct ReductionOverGroupNoAtomicFunctor
                 using dpctl::tensor::type_utils::convert_impl;
                 outT val;
                 if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
-                              su_ns::IsLogicalOr<outT, ReductionOp>::value)
-                {
+                              su_ns::IsLogicalOr<outT, ReductionOp>::value) {
                     // handle nans
                     val = convert_impl<bool, argT>(inp_[inp_offset]);
                 }
@@ -600,8 +594,7 @@ struct CustomReductionOverGroupNoAtomicFunctor
                 if constexpr (std::is_same_v<ReductionOp,
                                              sycl::logical_and<outT>> ||
                               std::is_same_v<ReductionOp,
-                                             sycl::logical_or<outT>>)
-                {
+                                             sycl::logical_or<outT>>) {
                     // handle nans
                     val = convert_impl<bool, argT>(inp_[inp_offset]);
                 }
@@ -626,14 +619,16 @@ struct CustomReductionOverGroupNoAtomicFunctor
     }
 };
 
-template <
-    typename argTy,
-    typename resTy,
-    typename ReductionOpT,
-    typename InputOutputIterIndexerT,
-    typename ReductionIndexerT,
-    template <typename T1, typename T2, typename T3, typename T4, typename T5>
-    class kernel_name_token>
+template <typename argTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename InputOutputIterIndexerT,
+          typename ReductionIndexerT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename T4,
+                    typename T5> class kernel_name_token>
 sycl::event
     sequential_reduction(sycl::queue &exec_q,
                          const argTy *arg,
@@ -666,14 +661,16 @@ sycl::event
 template <typename BasedKernelName>
 class custom_reduction_wrapper;
 
-template <
-    typename argTy,
-    typename resTy,
-    typename ReductionOpT,
-    typename InputOutputIterIndexerT,
-    typename ReductionIndexerT,
-    template <typename T1, typename T2, typename T3, typename T4, typename T5>
-    class kernel_name_token>
+template <typename argTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename InputOutputIterIndexerT,
+          typename ReductionIndexerT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename T4,
+                    typename T5> class kernel_name_token>
 sycl::event
     submit_atomic_reduction(sycl::queue &exec_q,
                             const argTy *arg,
@@ -1051,14 +1048,16 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl(
 
 /* = Reduction, using sycl::reduce_over_group, but not using atomic_ref = */
 
-template <
-    typename argTy,
-    typename resTy,
-    typename ReductionOpT,
-    typename InputOutputIterIndexerT,
-    typename ReductionIndexerT,
-    template <typename T1, typename T2, typename T3, typename T4, typename T5>
-    class kernel_name_token>
+template <typename argTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename InputOutputIterIndexerT,
+          typename ReductionIndexerT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename T4,
+                    typename T5> class kernel_name_token>
 sycl::event submit_no_atomic_reduction(
     sycl::queue &exec_q,
     const argTy *arg,
@@ -1928,15 +1927,13 @@ struct SequentialSearchReduction
                         // less_complex always returns false for NaNs, so check
                         if (less_complex<argT>(val, red_val) ||
                             std::isnan(std::real(val)) ||
-                            std::isnan(std::imag(val)))
-                        {
+                            std::isnan(std::imag(val))) {
                             red_val = val;
                             idx_val = static_cast<outT>(m);
                         }
                     }
                     else if constexpr (std::is_floating_point_v<argT> ||
-                                       std::is_same_v<argT, sycl::half>)
-                    {
+                                       std::is_same_v<argT, sycl::half>) {
                         if (val < red_val || std::isnan(val)) {
                             red_val = val;
                             idx_val = static_cast<outT>(m);
@@ -1955,15 +1952,13 @@ struct SequentialSearchReduction
                         using dpctl::tensor::math_utils::greater_complex;
                         if (greater_complex<argT>(val, red_val) ||
                             std::isnan(std::real(val)) ||
-                            std::isnan(std::imag(val)))
-                        {
+                            std::isnan(std::imag(val))) {
                             red_val = val;
                             idx_val = static_cast<outT>(m);
                         }
                     }
                     else if constexpr (std::is_floating_point_v<argT> ||
-                                       std::is_same_v<argT, sycl::half>)
-                    {
+                                       std::is_same_v<argT, sycl::half>) {
                         if (val > red_val || std::isnan(val)) {
                             red_val = val;
                             idx_val = static_cast<outT>(m);
@@ -2243,8 +2238,7 @@ struct CustomSearchReduction
                             // check
                             if (less_complex<argT>(val, local_red_val) ||
                                 std::isnan(std::real(val)) ||
-                                std::isnan(std::imag(val)))
-                            {
+                                std::isnan(std::imag(val))) {
                                 local_red_val = val;
                                 if constexpr (!First) {
                                     local_idx = inds_[inp_offset];
@@ -2256,8 +2250,7 @@ struct CustomSearchReduction
                             }
                         }
                         else if constexpr (std::is_floating_point_v<argT> ||
-                                           std::is_same_v<argT, sycl::half>)
-                        {
+                                           std::is_same_v<argT, sycl::half>) {
                             if (val < local_red_val || std::isnan(val)) {
                                 local_red_val = val;
                                 if constexpr (!First) {
@@ -2289,8 +2282,7 @@ struct CustomSearchReduction
                             using dpctl::tensor::math_utils::greater_complex;
                             if (greater_complex<argT>(val, local_red_val) ||
                                 std::isnan(std::real(val)) ||
-                                std::isnan(std::imag(val)))
-                            {
+                                std::isnan(std::imag(val))) {
                                 local_red_val = val;
                                 if constexpr (!First) {
                                     local_idx = inds_[inp_offset];
@@ -2302,8 +2294,7 @@ struct CustomSearchReduction
                             }
                         }
                         else if constexpr (std::is_floating_point_v<argT> ||
-                                           std::is_same_v<argT, sycl::half>)
-                        {
+                                           std::is_same_v<argT, sycl::half>) {
                             if (val > local_red_val || std::isnan(val)) {
                                 local_red_val = val;
                                 if constexpr (!First) {
@@ -2347,8 +2338,7 @@ struct CustomSearchReduction
                             : idx_identity_;
         }
         else if constexpr (std::is_floating_point_v<argT> ||
-                           std::is_same_v<argT, sycl::half>)
-        {
+                           std::is_same_v<argT, sycl::half>) {
             // equality does not hold for NaNs, so check here
             local_idx =
                 (red_val_over_wg == local_red_val || std::isnan(local_red_val))
diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
index a047c172f7bc..75d3dc5f01a0 100644
--- a/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
@@ -190,8 +190,8 @@ void merge_impl(const std::size_t offset,
             // Handle intermediate items
             if (l_search_bound_1 == r_search_bound_1) {
                 const std::size_t shift_1 = l_search_bound_1 - start_1;
-                for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx)
-                {
+                for (auto idx = local_start_2 + 1; idx < local_end_2 - 1;
+                     ++idx) {
                     const auto intermediate_item_2 = in_acc[idx];
                     const std::size_t shift_2 = idx - start_2;
                     out_acc[start_out + shift_1 + shift_2] =
@@ -199,8 +199,8 @@ void merge_impl(const std::size_t offset,
                 }
             }
             else {
-                for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx)
-                {
+                for (auto idx = local_start_2 + 1; idx < local_end_2 - 1;
+                     ++idx) {
                     const auto intermediate_item_2 = in_acc[idx];
                     // we shouldn't seek in whole 1st sequence. Just for the
                     // part where the 2nd sequence should be
@@ -282,10 +282,7 @@ struct GetValueType<sycl::buffer<ElementType, Dim, AllocatorT>>
 template <typename Iter>
 struct GetReadOnlyAccess
 {
-    Iter operator()(const Iter &it, sycl::handler &)
-    {
-        return it;
-    }
+    Iter operator()(const Iter &it, sycl::handler &) { return it; }
 };
 
 template <typename ElementType, int Dim, typename AllocatorT>
@@ -302,10 +299,7 @@ struct GetReadOnlyAccess<sycl::buffer<ElementType, Dim, AllocatorT>>
 template <typename Iter>
 struct GetWriteDiscardAccess
 {
-    Iter operator()(Iter it, sycl::handler &)
-    {
-        return it;
-    }
+    Iter operator()(Iter it, sycl::handler &) { return it; }
 };
 
 template <typename ElementType, int Dim, typename AllocatorT>
@@ -322,10 +316,7 @@ struct GetWriteDiscardAccess<sycl::buffer<ElementType, Dim, AllocatorT>>
 template <typename Iter>
 struct GetReadWriteAccess
 {
-    Iter operator()(Iter &it, sycl::handler &)
-    {
-        return it;
-    }
+    Iter operator()(Iter &it, sycl::handler &) { return it; }
 };
 
 template <typename ElementType, int Dim, typename AllocatorT>
@@ -479,8 +470,7 @@ sycl::event sort_over_work_group_contig_impl(
 
             // load input into SLM
             for (std::size_t array_id = segment_start_idx + lid;
-                 array_id < segment_end_idx; array_id += lws)
-            {
+                 array_id < segment_end_idx; array_id += lws) {
                 T v = (array_id < sort_nelems)
                           ? input_acc[iter_id * sort_nelems + array_id]
                           : T{};
@@ -505,8 +495,7 @@ sycl::event sort_over_work_group_contig_impl(
             const std::size_t max_chunks_merged =
                 1 + ((wg_chunk_size - 1) / chunk);
             for (; n_chunks_merged < max_chunks_merged;
-                 data_in_temp = !data_in_temp, n_chunks_merged *= 2)
-            {
+                 data_in_temp = !data_in_temp, n_chunks_merged *= 2) {
                 const std::size_t nelems_sorted_so_far =
                     n_chunks_merged * chunk;
                 const std::size_t q = (lid / n_chunks_merged);
@@ -531,8 +520,7 @@ sycl::event sort_over_work_group_contig_impl(
 
             const auto &out_src = (data_in_temp) ? scratch_space : work_space;
             for (std::size_t array_id = segment_start_idx + lid;
-                 array_id < segment_end_idx; array_id += lws)
-            {
+                 array_id < segment_end_idx; array_id += lws) {
                 if (array_id < sort_nelems) {
                     output_acc[iter_id * sort_nelems + array_id] =
                         out_src[array_id - segment_start_idx];
diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
index 940c6d802a9a..5baa98e237df 100644
--- a/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
@@ -374,8 +374,7 @@ sycl::event
             // each work-item in the order of their local ids
             const std::uint32_t count_start_id = radix_states * lid;
             for (std::uint32_t radix_state_id = 0;
-                 radix_state_id < radix_states; ++radix_state_id)
-            {
+                 radix_state_id < radix_states; ++radix_state_id) {
                 counts_lacc[count_start_id + radix_state_id] =
                     counts_arr[radix_state_id];
             }
@@ -396,8 +395,7 @@ sycl::event
             // count per work-group: reduce until count_lacc[] size >
             // radix_states (n_witems /= 2 per iteration)
             for (std::uint32_t n_witems = (wg_size >> 1);
-                 n_witems >= radix_states; n_witems >>= 1)
-            {
+                 n_witems >= radix_states; n_witems >>= 1) {
                 if (lid < n_witems)
                     counts_lacc[lid] += counts_lacc[n_witems + lid];
 
@@ -478,8 +476,8 @@ sycl::event radix_sort_scan_submit(sycl::queue &exec_q,
 
             // NB: No race condition here, because the condition may ever be
             // true for only on one WG, one WI.
-            if ((lid == wg_size - 1) && (begin_ptr[scan_size - 1] == n_values))
-            {
+            if ((lid == wg_size - 1) &&
+                (begin_ptr[scan_size - 1] == n_values)) {
                 // set flag, since all the values got into one
                 // this is optimization, may happy often for
                 // higher radix offsets (all zeros)
@@ -794,8 +792,7 @@ sycl::event
             offset_arr[zero_radix_state_id] = b_offset_ptr[segment_id];
 
             for (std::uint32_t radix_state_id = 1;
-                 radix_state_id < radix_states; ++radix_state_id)
-            {
+                 radix_state_id < radix_states; ++radix_state_id) {
                 const std::uint32_t local_offset_id =
                     segment_id + scan_size * radix_state_id;
 
@@ -835,8 +832,7 @@ sycl::event
 
                     OffsetT new_offset_id = 0;
                     for (std::uint32_t radix_state_id = 0;
-                         radix_state_id < radix_states; ++radix_state_id)
-                    {
+                         radix_state_id < radix_states; ++radix_state_id) {
                         bool is_current_bucket = (bucket_id == radix_state_id);
                         std::uint32_t sg_total_offset =
                             peer_prefix_hlp.peer_contribution(
@@ -864,8 +860,7 @@ sycl::event
 
                     OffsetT new_offset_id = 0;
                     for (std::uint32_t radix_state_id = 0;
-                         radix_state_id < radix_states; ++radix_state_id)
-                    {
+                         radix_state_id < radix_states; ++radix_state_id) {
                         bool is_current_bucket = (bucket_id == radix_state_id);
                         std::uint32_t sg_total_offset =
                             peer_prefix_hlp.peer_contribution(
@@ -899,8 +894,7 @@ sycl::event
 
                 OffsetT new_offset_id = 0;
                 for (std::uint32_t radix_state_id = 0;
-                     radix_state_id < radix_states; ++radix_state_id)
-                {
+                     radix_state_id < radix_states; ++radix_state_id) {
                     bool is_current_bucket = (bucket_id == radix_state_id);
                     std::uint32_t sg_total_offset =
                         peer_prefix_hlp.peer_contribution(
@@ -1038,8 +1032,7 @@ struct parallel_radix_sort_iteration_step
         static constexpr std::size_t sg16_v = 16u;
         static constexpr std::size_t sg08_v = 8u;
         if (sg32_v == reorder_sg_size || sg16_v == reorder_sg_size ||
-            sg08_v == reorder_sg_size)
-        {
+            sg08_v == reorder_sg_size) {
             static constexpr auto peer_algorithm =
                 peer_prefix_algo::subgroup_ballot;
 
@@ -1164,10 +1157,7 @@ struct subgroup_radix_sort
             return sycl::local_accessor<KeyT>(buf_size, cgh);
         }
 
-        std::size_t get_iter_stride() const
-        {
-            return std::size_t{0};
-        }
+        std::size_t get_iter_stride() const { return std::size_t{0}; }
     };
 
     template <typename KeyT>
@@ -1185,10 +1175,7 @@ struct subgroup_radix_sort
         {
             return sycl::accessor(buf, cgh, sycl::read_write, sycl::no_init);
         }
-        std::size_t get_iter_stride() const
-        {
-            return iter_stride;
-        }
+        std::size_t get_iter_stride() const { return iter_stride; }
     };
 
     static_assert(wg_size <= 1024);
@@ -1798,10 +1785,7 @@ struct IndexedProj
     {
     }
 
-    auto operator()(IndexT i) const
-    {
-        return value_projector(ptr[i]);
-    }
+    auto operator()(IndexT i) const { return value_projector(ptr[i]); }
 
 private:
     const ValueT *ptr;
diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp
index d9a103a02e99..1bbaa9e8345a 100644
--- a/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp
@@ -299,8 +299,7 @@ sycl::event topk_merge_impl(
         // if allocation would be sufficiently large or k is larger than
         // elements processed, use full sort
         if (k_rounded >= axis_nelems || k_rounded >= sorted_block_size ||
-            alloc_len >= axis_nelems / 2)
-        {
+            alloc_len >= axis_nelems / 2) {
             return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems,
                                              k, arg_tp, vals_tp, inds_tp,
                                              index_comp, depends);
@@ -346,8 +345,7 @@ sycl::event topk_merge_impl(
 
                     // load input into SLM
                     for (std::size_t array_id = segment_start_idx + lid;
-                         array_id < segment_end_idx; array_id += lws)
-                    {
+                         array_id < segment_end_idx; array_id += lws) {
                         IndexTy v = (array_id < axis_nelems)
                                         ? iter_id * axis_nelems + array_id
                                         : IndexTy{};
@@ -374,8 +372,7 @@ sycl::event topk_merge_impl(
                     const std::size_t max_chunks_merged =
                         1 + ((wg_chunk_size - 1) / chunk);
                     for (; n_chunks_merged < max_chunks_merged;
-                         data_in_temp = !data_in_temp, n_chunks_merged *= 2)
-                    {
+                         data_in_temp = !data_in_temp, n_chunks_merged *= 2) {
                         const std::size_t nelems_sorted_so_far =
                             n_chunks_merged * chunk;
                         const std::size_t q = (lid / n_chunks_merged);
@@ -410,8 +407,7 @@ sycl::event topk_merge_impl(
                     const auto &out_src =
                         (data_in_temp) ? scratch_space : work_space;
                     for (std::size_t array_id = k_segment_start_idx + lid;
-                         array_id < k_segment_end_idx; array_id += lws)
-                    {
+                         array_id < k_segment_end_idx; array_id += lws) {
                         if (lid < k_rounded) {
                             index_data[iter_id * alloc_len + array_id] =
                                 out_src[array_id - k_segment_start_idx];
diff --git a/dpnp/tensor/libtensor/include/kernels/where.hpp b/dpnp/tensor/libtensor/include/kernels/where.hpp
index 454e1e61fa0d..5527cccec8d2 100644
--- a/dpnp/tensor/libtensor/include/kernels/where.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/where.hpp
@@ -96,8 +96,7 @@ class WhereContigFunctor
 
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (!enable_sg_loadstore || is_complex<condT>::value ||
-                      is_complex<T>::value)
-        {
+                      is_complex<T>::value) {
             const std::uint16_t sgSize =
                 ndit.get_sub_group().get_local_range()[0];
             const std::size_t gid = ndit.get_global_linear_id();
@@ -199,8 +198,7 @@ sycl::event where_contig_impl(sycl::queue &q,
         if (is_aligned<required_alignment>(cond_cp) &&
             is_aligned<required_alignment>(x1_cp) &&
             is_aligned<required_alignment>(x2_cp) &&
-            is_aligned<required_alignment>(dst_cp))
-        {
+            is_aligned<required_alignment>(dst_cp)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName = where_contig_kernel<T, condT, vec_sz, n_vecs>;
 
diff --git a/dpnp/tensor/libtensor/include/utils/offset_utils.hpp b/dpnp/tensor/libtensor/include/utils/offset_utils.hpp
index 19664c3d4e12..3a6ac75dfc3a 100644
--- a/dpnp/tensor/libtensor/include/utils/offset_utils.hpp
+++ b/dpnp/tensor/libtensor/include/utils/offset_utils.hpp
@@ -53,9 +53,9 @@ namespace detail
 {
 struct sink_t
 {
-    sink_t(){};
+    sink_t() {};
     template <class T>
-    sink_t(T &&){};
+    sink_t(T &&) {};
 };
 
 template <class V>
@@ -137,10 +137,7 @@ std::tuple<std::unique_ptr<indT, dpctl::tensor::alloc_utils::USMDeleter>,
 struct NoOpIndexer
 {
     constexpr NoOpIndexer() {}
-    constexpr std::size_t operator()(std::size_t gid) const
-    {
-        return gid;
-    }
+    constexpr std::size_t operator()(std::size_t gid) const { return gid; }
 };
 
 using dpctl::tensor::ssize_t;
@@ -156,10 +153,7 @@ struct StridedIndexer
     {
     }
 
-    ssize_t operator()(ssize_t gid) const
-    {
-        return compute_offset(gid);
-    }
+    ssize_t operator()(ssize_t gid) const { return compute_offset(gid); }
 
     ssize_t operator()(std::size_t gid) const
     {
@@ -200,10 +194,7 @@ struct UnpackedStridedIndexer
     {
     }
 
-    ssize_t operator()(ssize_t gid) const
-    {
-        return compute_offset(gid);
-    }
+    ssize_t operator()(ssize_t gid) const { return compute_offset(gid); }
 
     ssize_t operator()(std::size_t gid) const
     {
@@ -310,14 +301,8 @@ struct TwoOffsets
     {
     }
 
-    constexpr displacementT get_first_offset() const
-    {
-        return first_offset;
-    }
-    constexpr displacementT get_second_offset() const
-    {
-        return second_offset;
-    }
+    constexpr displacementT get_first_offset() const { return first_offset; }
+    constexpr displacementT get_second_offset() const { return second_offset; }
 
 private:
     displacementT first_offset = 0;
@@ -418,18 +403,9 @@ struct ThreeOffsets
     {
     }
 
-    constexpr displacementT get_first_offset() const
-    {
-        return first_offset;
-    }
-    constexpr displacementT get_second_offset() const
-    {
-        return second_offset;
-    }
-    constexpr displacementT get_third_offset() const
-    {
-        return third_offset;
-    }
+    constexpr displacementT get_first_offset() const { return first_offset; }
+    constexpr displacementT get_second_offset() const { return second_offset; }
+    constexpr displacementT get_third_offset() const { return third_offset; }
 
 private:
     displacementT first_offset = 0;
@@ -552,22 +528,10 @@ struct FourOffsets
     {
     }
 
-    constexpr displacementT get_first_offset() const
-    {
-        return first_offset;
-    }
-    constexpr displacementT get_second_offset() const
-    {
-        return second_offset;
-    }
-    constexpr displacementT get_third_offset() const
-    {
-        return third_offset;
-    }
-    constexpr displacementT get_fourth_offset() const
-    {
-        return fourth_offset;
-    }
+    constexpr displacementT get_first_offset() const { return first_offset; }
+    constexpr displacementT get_second_offset() const { return second_offset; }
+    constexpr displacementT get_third_offset() const { return third_offset; }
+    constexpr displacementT get_fourth_offset() const { return fourth_offset; }
 
 private:
     displacementT first_offset = 0;
diff --git a/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp b/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp
index 87cdfbfbd54f..5d03294392d8 100644
--- a/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp
+++ b/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp
@@ -112,9 +112,9 @@ struct ExtendedComplexFPGreater
 };
 
 template <typename T>
-inline constexpr bool is_fp_v = (std::is_same_v<T, sycl::half> ||
-                                 std::is_same_v<T, float> ||
-                                 std::is_same_v<T, double>);
+inline constexpr bool is_fp_v =
+    (std::is_same_v<T, sycl::half> || std::is_same_v<T, float> ||
+     std::is_same_v<T, double>);
 
 } // namespace detail
 
diff --git a/dpnp/tensor/libtensor/include/utils/strided_iters.hpp b/dpnp/tensor/libtensor/include/utils/strided_iters.hpp
index 0bed181802ae..65250b755b56 100644
--- a/dpnp/tensor/libtensor/include/utils/strided_iters.hpp
+++ b/dpnp/tensor/libtensor/include/utils/strided_iters.hpp
@@ -312,14 +312,8 @@ class CIndexer_array
         elem_count = s;
     }
 
-    indT size() const
-    {
-        return elem_count;
-    }
-    indT rank() const
-    {
-        return ndim;
-    }
+    indT size() const { return elem_count; }
+    indT rank() const { return ndim; }
 
     void set(const indT i)
     {
@@ -339,10 +333,7 @@ class CIndexer_array
         multi_index[0] = i_;
     }
 
-    const index_t &get() const
-    {
-        return multi_index;
-    }
+    const index_t &get() const { return multi_index; }
 };
 
 /*
@@ -658,8 +649,7 @@ int simplify_iteration_three_strides(const int nd,
         auto str3_p = strides3[p];
         shape_w.push_back(sh_p);
         if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 &&
-            std::min({str1_p, str2_p, str3_p}) < 0)
-        {
+            std::min({str1_p, str2_p, str3_p}) < 0) {
             disp1 += str1_p * (sh_p - 1);
             str1_p = -str1_p;
             disp2 += str2_p * (sh_p - 1);
@@ -832,8 +822,7 @@ int simplify_iteration_four_strides(const int nd,
         auto str4_p = strides4[p];
         shape_w.push_back(sh_p);
         if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 && str4_p <= 0 &&
-            std::min({str1_p, str2_p, str3_p, str4_p}) < 0)
-        {
+            std::min({str1_p, str2_p, str3_p, str4_p}) < 0) {
             disp1 += str1_p * (sh_p - 1);
             str1_p = -str1_p;
             disp2 += str2_p * (sh_p - 1);
@@ -919,8 +908,7 @@ std::tuple<vecT, vecT, T, vecT, T, vecT, T, vecT, T>
 {
     const std::size_t dim = shape.size();
     if (dim != strides1.size() || dim != strides2.size() ||
-        dim != strides3.size() || dim != strides4.size())
-    {
+        dim != strides3.size() || dim != strides4.size()) {
         throw Error("Shape and strides must be of equal size.");
     }
     vecT out_shape = shape;
diff --git a/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp b/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp
index f45918e3c800..9ae41e5ade6e 100644
--- a/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -501,10 +501,7 @@ struct GetIdentity<Op, T, std::enable_if_t<IsLogSumExp<T, Op>::value>>
 template <typename T>
 struct Hypot
 {
-    T operator()(const T &x, const T &y) const
-    {
-        return sycl::hypot(x, y);
-    }
+    T operator()(const T &x, const T &y) const { return sycl::hypot(x, y); }
 };
 
 template <typename T, class Op>
diff --git a/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp b/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp
index d08187aeaacc..bead0da5093e 100644
--- a/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp
+++ b/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp
@@ -106,8 +106,8 @@ struct usm_ndarray_types
                 throw_unrecognized_typenum_error(typenum);
             }
         }
-        else if (typenum == api.UAR_LONGLONG_ || typenum == api.UAR_ULONGLONG_)
-        {
+        else if (typenum == api.UAR_LONGLONG_ ||
+                 typenum == api.UAR_ULONGLONG_) {
             switch (sizeof(long long)) {
             case sizeof(std::int64_t):
                 return ((typenum == api.UAR_LONGLONG_)
diff --git a/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp
index 431e020fbdbe..7170624b5bbe 100644
--- a/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp
+++ b/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp
@@ -62,8 +62,7 @@ enum class typenum_t : int
 inline constexpr int num_types = 14; // number of elements in typenum_t
 
 template <typename funcPtrT,
-          template <typename fnT, typename D, typename S>
-          typename factory,
+          template <typename fnT, typename D, typename S> typename factory,
           int _num_types>
 class DispatchTableBuilder
 {
@@ -124,8 +123,7 @@ class DispatchTableBuilder
 };
 
 template <typename funcPtrT,
-          template <typename fnT, typename T>
-          typename factory,
+          template <typename fnT, typename T> typename factory,
           int _num_types>
 class DispatchVectorBuilder
 {
@@ -260,10 +258,7 @@ struct NullPtrVector
 
     NullPtrVector() : val(nullptr) {}
 
-    const_reference operator[](int) const
-    {
-        return val;
-    }
+    const_reference operator[](int) const { return val; }
 
 private:
     value_type val;
@@ -278,10 +273,7 @@ struct NullPtrTable
 
     NullPtrTable() : val() {}
 
-    const_reference operator[](int) const
-    {
-        return val;
-    }
+    const_reference operator[](int) const { return val; }
 
 private:
     value_type val;
diff --git a/dpnp/tensor/libtensor/include/utils/type_utils.hpp b/dpnp/tensor/libtensor/include/utils/type_utils.hpp
index e5855081c727..47b1a5554815 100644
--- a/dpnp/tensor/libtensor/include/utils/type_utils.hpp
+++ b/dpnp/tensor/libtensor/include/utils/type_utils.hpp
@@ -98,8 +98,7 @@ dstTy convert_impl(const srcTy &v)
     }
     else if constexpr (!std::is_integral_v<srcTy> &&
                        !std::is_same_v<dstTy, bool> &&
-                       std::is_integral_v<dstTy> && std::is_unsigned_v<dstTy>)
-    {
+                       std::is_integral_v<dstTy> && std::is_unsigned_v<dstTy>) {
         // first cast to signed variant, the cast to unsigned one
         using signedT = typename std::make_signed_t<dstTy>;
         return static_cast<dstTy>(convert_impl<signedT, srcTy>(v));
diff --git a/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
index 4dd00620a260..bce47c45f9b1 100644
--- a/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
+++ b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
@@ -445,8 +445,7 @@ bool py_accumulate_dtype_supported(const py::dtype &input_dtype,
     }
 
     if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
-        out_typeid >= td_ns::num_types)
-    {
+        out_typeid >= td_ns::num_types) {
         throw std::runtime_error("Reduction type support check: lookup failed");
     }
 
diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
index e24cf56ddd62..d4961c9edbf1 100644
--- a/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
+++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
@@ -140,8 +140,7 @@ struct CumLogSumExp1DContigFactory
     fnT get()
     {
         if constexpr (TypePairSupportDataForLogSumExpAccumulation<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = su_ns::LogSumExp<dstTy>;
             static constexpr bool include_initial = false;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -173,8 +172,7 @@ struct CumLogSumExp1DIncludeInitialContigFactory
     fnT get()
     {
         if constexpr (TypePairSupportDataForLogSumExpAccumulation<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = su_ns::LogSumExp<dstTy>;
             static constexpr bool include_initial = true;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -206,8 +204,7 @@ struct CumLogSumExpStridedFactory
     fnT get()
     {
         if constexpr (TypePairSupportDataForLogSumExpAccumulation<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = su_ns::LogSumExp<dstTy>;
             static constexpr bool include_initial = false;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -239,8 +236,7 @@ struct CumLogSumExpIncludeInitialStridedFactory
     fnT get()
     {
         if constexpr (TypePairSupportDataForLogSumExpAccumulation<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = su_ns::LogSumExp<dstTy>;
             static constexpr bool include_initial = true;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp
index 65f3c311eda1..319709b30a76 100644
--- a/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp
+++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp
@@ -151,9 +151,8 @@ struct CumProd1DContigFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForProdAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumProdScanOpT<dstTy>;
             static constexpr bool include_initial = false;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -184,9 +183,8 @@ struct CumProd1DIncludeInitialContigFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForProdAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumProdScanOpT<dstTy>;
             static constexpr bool include_initial = true;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -217,9 +215,8 @@ struct CumProdStridedFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForProdAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumProdScanOpT<dstTy>;
             static constexpr bool include_initial = false;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -250,9 +247,8 @@ struct CumProdIncludeInitialStridedFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForProdAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumProdScanOpT<dstTy>;
             static constexpr bool include_initial = true;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp
index 60b46946acc9..f700883af2a1 100644
--- a/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp
+++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp
@@ -150,9 +150,8 @@ struct CumSum1DContigFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
-                                                            dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForSumAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumSumScanOpT<dstTy>;
             static constexpr bool include_initial = false;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -183,9 +182,8 @@ struct CumSum1DIncludeInitialContigFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
-                                                            dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForSumAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumSumScanOpT<dstTy>;
             static constexpr bool include_initial = true;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -216,9 +214,8 @@ struct CumSumStridedFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
-                                                            dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForSumAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumSumScanOpT<dstTy>;
             static constexpr bool include_initial = false;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -249,9 +246,8 @@ struct CumSumIncludeInitialStridedFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
-                                                            dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForSumAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumSumScanOpT<dstTy>;
             static constexpr bool include_initial = true;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
diff --git a/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp
index e44abbd48303..146be45e4858 100644
--- a/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp
+++ b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -213,8 +213,7 @@ std::pair<sycl::event, sycl::event>
     // masked_dst_nelems is number of set elements in the mask, or last element
     // in cumsum
     if (!same_ortho_dims ||
-        (masked_src_nelems != static_cast<std::size_t>(cumsum_sz)))
-    {
+        (masked_src_nelems != static_cast<std::size_t>(cumsum_sz))) {
         throw py::value_error("Inconsistent array dimensions");
     }
 
@@ -539,8 +538,7 @@ std::pair<sycl::event, sycl::event>
     }
 
     if (!same_ortho_dims ||
-        (masked_dst_nelems != static_cast<std::size_t>(cumsum_sz)))
-    {
+        (masked_dst_nelems != static_cast<std::size_t>(cumsum_sz))) {
         throw py::value_error("Inconsistent array dimensions");
     }
 
@@ -791,8 +789,7 @@ std::pair<sycl::event, sycl::event>
 
     // cumsum must be int32_t or int64_t only
     if ((cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) ||
-        (indexes_typeid != int32_typeid && indexes_typeid != int64_typeid))
-    {
+        (indexes_typeid != int32_typeid && indexes_typeid != int64_typeid)) {
         throw py::value_error("Cumulative sum array and index array must have "
                               "int32 or int64 data-type");
     }
diff --git a/dpnp/tensor/libtensor/source/clip.cpp b/dpnp/tensor/libtensor/source/clip.cpp
index 3e1c5e8cd262..4a0e5b9357de 100644
--- a/dpnp/tensor/libtensor/source/clip.cpp
+++ b/dpnp/tensor/libtensor/source/clip.cpp
@@ -142,8 +142,7 @@ std::pair<sycl::event, sycl::event>
         dpctl::tensor::overlap::SameLogicalTensors();
     if ((overlap(dst, src) && !same_logical_tensors(dst, src)) ||
         (overlap(dst, min) && !same_logical_tensors(dst, min)) ||
-        (overlap(dst, max) && !same_logical_tensors(dst, max)))
-    {
+        (overlap(dst, max) && !same_logical_tensors(dst, max))) {
         throw py::value_error("Destination array overlaps with input.");
     }
 
@@ -159,8 +158,7 @@ std::pair<sycl::event, sycl::event>
     int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
 
     if (src_typeid != dst_typeid || src_typeid != min_typeid ||
-        src_typeid != max_typeid)
-    {
+        src_typeid != max_typeid) {
         throw py::value_error("Input, min, max, and destination arrays must "
                               "have the same data type");
     }
diff --git a/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
index 43a6fbf4a0dd..7c2db989b0c2 100644
--- a/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
+++ b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
@@ -204,8 +204,7 @@ std::pair<sycl::event, sycl::event> copy_usm_ndarray_into_usm_ndarray(
 
             sycl::event copy_and_cast_1d_event;
             if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) &&
-                (src_offset == 0) && (dst_offset == 0))
-            {
+                (src_offset == 0) && (dst_offset == 0)) {
                 auto contig_fn =
                     copy_and_cast_contig_dispatch_table[dst_type_id]
                                                        [src_type_id];
diff --git a/dpnp/tensor/libtensor/source/copy_as_contig.cpp b/dpnp/tensor/libtensor/source/copy_as_contig.cpp
index 5d78862651fc..c1c4b740dfba 100644
--- a/dpnp/tensor/libtensor/source/copy_as_contig.cpp
+++ b/dpnp/tensor/libtensor/source/copy_as_contig.cpp
@@ -535,8 +535,7 @@ std::pair<sycl::event, sycl::event>
     if (1 == nd) {
         const auto expected_dim = static_cast<py::ssize_t>(batch_nelems);
         if ((simplified_shape.front() != expected_dim) ||
-            (simplified_dst_strides.front() != dst_batch_step))
-        {
+            (simplified_dst_strides.front() != dst_batch_step)) {
             throw std::runtime_error(
                 "Unexpected result of simplifying iteration space, 2");
         }
@@ -727,8 +726,7 @@ std::pair<sycl::event, sycl::event>
     if (1 == nd) {
         const auto expected_dim = static_cast<py::ssize_t>(batch_nelems);
         if ((simplified_shape.front() != expected_dim) ||
-            (simplified_dst_strides.front() != dst_batch_step))
-        {
+            (simplified_dst_strides.front() != dst_batch_step)) {
             throw std::runtime_error(
                 "Unexpected result of simplifying iteration space, 2");
         }
diff --git a/dpnp/tensor/libtensor/source/device_support_queries.cpp b/dpnp/tensor/libtensor/source/device_support_queries.cpp
index 3cc0952c2080..6026520f3daa 100644
--- a/dpnp/tensor/libtensor/source/device_support_queries.cpp
+++ b/dpnp/tensor/libtensor/source/device_support_queries.cpp
@@ -110,15 +110,9 @@ std::string _default_device_complex_type(const sycl::device &d)
     }
 }
 
-std::string _default_device_bool_type(const sycl::device &)
-{
-    return "b1";
-}
+std::string _default_device_bool_type(const sycl::device &) { return "b1"; }
 
-std::string _default_device_index_type(const sycl::device &)
-{
-    return "i8";
-}
+std::string _default_device_index_type(const sycl::device &) { return "i8"; }
 
 sycl::device _extract_device(const py::object &arg)
 {
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
index b8450f8e7296..3a8dc6bfb56f 100644
--- a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
@@ -377,8 +377,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
     auto const &same_logical_tensors =
         dpctl::tensor::overlap::SameLogicalTensors();
     if ((overlap(src1, dst) && !same_logical_tensors(src1, dst)) ||
-        (overlap(src2, dst) && !same_logical_tensors(src2, dst)))
-    {
+        (overlap(src2, dst) && !same_logical_tensors(src2, dst))) {
         throw py::value_error("Arrays index overlapping segments of memory");
     }
     // check memory overlap
@@ -445,8 +444,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
 
         if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) &&
             isEqual(simplified_src2_strides, unit_stride) &&
-            isEqual(simplified_dst_strides, unit_stride))
-        {
+            isEqual(simplified_dst_strides, unit_stride)) {
             auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid];
 
             if (contig_fn != nullptr) {
@@ -468,8 +466,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
             // special case of C-contiguous matrix and a row
             if (isEqual(simplified_src2_strides, zero_one_strides) &&
                 isEqual(simplified_src1_strides, {simplified_shape[1], one}) &&
-                isEqual(simplified_dst_strides, {simplified_shape[1], one}))
-            {
+                isEqual(simplified_dst_strides, {simplified_shape[1], one})) {
                 auto matrix_row_broadcast_fn =
                     contig_matrix_row_broadcast_dispatch_table[src1_typeid]
                                                               [src2_typeid];
@@ -483,8 +480,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
                         is_aligned<required_alignment>(
                             src2_data + src2_offset * src2_itemsize) &&
                         is_aligned<required_alignment>(
-                            dst_data + dst_offset * dst_itemsize))
-                    {
+                            dst_data + dst_offset * dst_itemsize)) {
                         std::size_t n0 = simplified_shape[0];
                         std::size_t n1 = simplified_shape[1];
                         sycl::event comp_ev = matrix_row_broadcast_fn(
@@ -501,8 +497,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
             }
             if (isEqual(simplified_src1_strides, one_zero_strides) &&
                 isEqual(simplified_src2_strides, {one, simplified_shape[0]}) &&
-                isEqual(simplified_dst_strides, {one, simplified_shape[0]}))
-            {
+                isEqual(simplified_dst_strides, {one, simplified_shape[0]})) {
                 auto row_matrix_broadcast_fn =
                     contig_row_matrix_broadcast_dispatch_table[src1_typeid]
                                                               [src2_typeid];
@@ -517,8 +512,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
                         is_aligned<required_alignment>(
                             src2_data + src2_offset * src2_itemsize) &&
                         is_aligned<required_alignment>(
-                            dst_data + dst_offset * dst_itemsize))
-                    {
+                            dst_data + dst_offset * dst_itemsize)) {
                         std::size_t n0 = simplified_shape[1];
                         std::size_t n1 = simplified_shape[0];
                         sycl::event comp_ev = row_matrix_broadcast_fn(
@@ -590,8 +584,7 @@ py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype,
     }
 
     if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 ||
-        src2_typeid >= td_ns::num_types)
-    {
+        src2_typeid >= td_ns::num_types) {
         throw std::runtime_error("binary output type lookup failed");
     }
     int dst_typeid = output_types_table[src1_typeid][src2_typeid];
@@ -739,8 +732,7 @@ std::pair<sycl::event, sycl::event>
             std::initializer_list<py::ssize_t>{1};
 
         if ((nd == 1) && isEqual(simplified_rhs_strides, unit_stride) &&
-            isEqual(simplified_lhs_strides, unit_stride))
-        {
+            isEqual(simplified_lhs_strides, unit_stride)) {
             auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid];
 
             if (contig_fn != nullptr) {
@@ -759,8 +751,7 @@ std::pair<sycl::event, sycl::event>
             static constexpr py::ssize_t one{1};
             // special case of C-contiguous matrix and a row
             if (isEqual(simplified_rhs_strides, one_zero_strides) &&
-                isEqual(simplified_lhs_strides, {one, simplified_shape[0]}))
-            {
+                isEqual(simplified_lhs_strides, {one, simplified_shape[0]})) {
                 auto row_matrix_broadcast_fn =
                     contig_row_matrix_broadcast_dispatch_table[rhs_typeid]
                                                               [lhs_typeid];
diff --git a/dpnp/tensor/libtensor/source/full_ctor.cpp b/dpnp/tensor/libtensor/source/full_ctor.cpp
index dfe1d25b769c..8d7fcd22b914 100644
--- a/dpnp/tensor/libtensor/source/full_ctor.cpp
+++ b/dpnp/tensor/libtensor/source/full_ctor.cpp
@@ -127,10 +127,7 @@ sycl::event full_contig_impl(sycl::queue &exec_q,
                 constexpr UInt128() : v1{}, v2{} {}
                 UInt128(const UInt128 &) = default;
 
-                operator bool() const
-                {
-                    return bool(!v1) && bool(!v2);
-                }
+                operator bool() const { return bool(!v1) && bool(!v2); }
 
                 std::uint64_t v1;
                 std::uint64_t v2;
diff --git a/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp b/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp
index 05ee37594e12..9621ebc3277f 100644
--- a/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp
+++ b/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp
@@ -217,8 +217,7 @@ std::pair<sycl::event, sycl::event>
     int x1_nd = x1.get_ndim();
     int x2_nd = x2.get_ndim();
     if (x1_nd != (batch_dims + x1_outer_dims + inner_dims) ||
-        x2_nd != (batch_dims + x2_outer_dims + inner_dims))
-    {
+        x2_nd != (batch_dims + x2_outer_dims + inner_dims)) {
         throw py::value_error("Input arrays do not have dimensions consistent "
                               "with input dimensions");
     }
@@ -332,8 +331,7 @@ std::pair<sycl::event, sycl::event>
     sycl::event dot_ev;
     if (call_vecdot) {
         if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig) ||
-            ((is_x1_f_contig && is_x2_f_contig) && !call_batched))
-        {
+            ((is_x1_f_contig && is_x2_f_contig) && !call_batched)) {
             dot_product_contig_impl_fn_ptr_t fn = nullptr;
             if (supports_atomics) {
                 fn = dot_product_contig_dispatch_table[x1_typeid][x2_typeid];
@@ -660,8 +658,7 @@ std::pair<sycl::event, sycl::event>
                 x1_batch_offset, x2_batch_offset, dst_batch_offset);
 
             if (batch_dims == 1 && x1_outer_dims == 1 && x2_outer_dims == 1 &&
-                inner_dims == 1)
-            {
+                inner_dims == 1) {
                 bool gemm_batch_c_contig = false;
 
                 if ((static_cast<std::size_t>(outer_inner_x1_strides[0]) ==
@@ -672,8 +669,7 @@ std::pair<sycl::event, sycl::event>
                      outer_inner_x2_strides[1] == 1) &&
                     (static_cast<std::size_t>(outer_inner_dst_strides[0]) ==
                          x2_outer_nelems &&
-                     outer_inner_dst_strides[1] == 1))
-                {
+                     outer_inner_dst_strides[1] == 1)) {
                     gemm_batch_c_contig =
                         (static_cast<std::size_t>(
                              simplified_batch_x1_strides[0]) ==
@@ -801,8 +797,7 @@ py::object py_dot_result_type(const py::dtype &input1_dtype,
     }
 
     if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 ||
-        src2_typeid >= td_ns::num_types)
-    {
+        src2_typeid >= td_ns::num_types) {
         throw std::runtime_error("binary output type lookup failed");
     }
     int dst_typeid = output_types_table[src1_typeid][src2_typeid];
diff --git a/dpnp/tensor/libtensor/source/reductions/argmax.cpp b/dpnp/tensor/libtensor/source/reductions/argmax.cpp
index 10fc49759168..af602371dfc5 100644
--- a/dpnp/tensor/libtensor/source/reductions/argmax.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/argmax.cpp
@@ -131,9 +131,8 @@ struct ArgmaxOverAxisTempsStridedFactory
 {
     fnT get() const
     {
-        if constexpr (TypePairSupportForArgmaxReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_integral_v<srcTy> &&
                           !std::is_same_v<srcTy, bool>) {
                 // op for values
@@ -165,9 +164,8 @@ struct ArgmaxOverAxis1TempsContigFactory
 {
     fnT get() const
     {
-        if constexpr (TypePairSupportForArgmaxReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_integral_v<srcTy> &&
                           !std::is_same_v<srcTy, bool>) {
                 // op for values
@@ -199,9 +197,8 @@ struct ArgmaxOverAxis0TempsContigFactory
 {
     fnT get() const
     {
-        if constexpr (TypePairSupportForArgmaxReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_integral_v<srcTy> &&
                           !std::is_same_v<srcTy, bool>) {
                 // op for values
diff --git a/dpnp/tensor/libtensor/source/reductions/argmin.cpp b/dpnp/tensor/libtensor/source/reductions/argmin.cpp
index ec4637b62d49..4869b75eacf9 100644
--- a/dpnp/tensor/libtensor/source/reductions/argmin.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/argmin.cpp
@@ -131,9 +131,8 @@ struct ArgminOverAxisTempsStridedFactory
 {
     fnT get() const
     {
-        if constexpr (TypePairSupportForArgminReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportForArgminReductionTemps<
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_integral_v<srcTy> &&
                           !std::is_same_v<srcTy, bool>) {
                 // op for values
@@ -165,9 +164,8 @@ struct ArgminOverAxis1TempsContigFactory
 {
     fnT get() const
     {
-        if constexpr (TypePairSupportForArgminReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportForArgminReductionTemps<
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_integral_v<srcTy> &&
                           !std::is_same_v<srcTy, bool>) {
                 // op for values
@@ -199,9 +197,8 @@ struct ArgminOverAxis0TempsContigFactory
 {
     fnT get() const
     {
-        if constexpr (TypePairSupportForArgminReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportForArgminReductionTemps<
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_integral_v<srcTy> &&
                           !std::is_same_v<srcTy, bool>) {
                 // op for values
diff --git a/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp b/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp
index 75e4010bfd5b..351eab82ee6b 100644
--- a/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp
@@ -138,8 +138,7 @@ struct LogSumExpOverAxisTempsStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = su_ns::LogSumExp<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_over_group_temps_strided_impl<srcTy, dstTy,
@@ -157,8 +156,7 @@ struct LogSumExpOverAxis1TempsContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = su_ns::LogSumExp<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
@@ -176,8 +174,7 @@ struct LogSumExpOverAxis0TempsContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = su_ns::LogSumExp<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
diff --git a/dpnp/tensor/libtensor/source/reductions/max.cpp b/dpnp/tensor/libtensor/source/reductions/max.cpp
index d19ed226d3b4..628f7cfe8606 100644
--- a/dpnp/tensor/libtensor/source/reductions/max.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/max.cpp
@@ -163,8 +163,7 @@ struct MaxOverAxisAtomicStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForMaxReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_floating_point<dstTy>::value) {
                 using ReductionOpT = su_ns::Maximum<dstTy>;
                 return dpctl::tensor::kernels::
@@ -217,8 +216,7 @@ struct MaxOverAxis1AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForMaxReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_floating_point<dstTy>::value) {
                 using ReductionOpT = su_ns::Maximum<dstTy>;
                 return dpctl::tensor::kernels::
@@ -244,8 +242,7 @@ struct MaxOverAxis0AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForMaxReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_floating_point<dstTy>::value) {
                 using ReductionOpT = su_ns::Maximum<dstTy>;
                 return dpctl::tensor::kernels::
diff --git a/dpnp/tensor/libtensor/source/reductions/min.cpp b/dpnp/tensor/libtensor/source/reductions/min.cpp
index 97d3432b13ed..68bfdb583b0b 100644
--- a/dpnp/tensor/libtensor/source/reductions/min.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/min.cpp
@@ -163,8 +163,7 @@ struct MinOverAxisAtomicStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForMinReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_floating_point<dstTy>::value) {
                 using ReductionOpT = su_ns::Minimum<dstTy>;
                 return dpctl::tensor::kernels::
@@ -217,8 +216,7 @@ struct MinOverAxis1AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForMinReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_floating_point<dstTy>::value) {
                 using ReductionOpT = su_ns::Minimum<dstTy>;
                 return dpctl::tensor::kernels::
@@ -244,8 +242,7 @@ struct MinOverAxis0AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForMinReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_floating_point<dstTy>::value) {
                 using ReductionOpT = su_ns::Minimum<dstTy>;
                 return dpctl::tensor::kernels::
diff --git a/dpnp/tensor/libtensor/source/reductions/prod.cpp b/dpnp/tensor/libtensor/source/reductions/prod.cpp
index 6cbb21dfe02c..9ecd403159b0 100644
--- a/dpnp/tensor/libtensor/source/reductions/prod.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/prod.cpp
@@ -246,8 +246,7 @@ struct ProductOverAxisAtomicStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForProductReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = sycl::multiplies<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
@@ -265,8 +264,7 @@ struct ProductOverAxisTempsStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForProductReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = std::conditional_t<std::is_same_v<dstTy, bool>,
                                                     sycl::logical_and<dstTy>,
                                                     sycl::multiplies<dstTy>>;
@@ -286,8 +284,7 @@ struct ProductOverAxis1AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForProductReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = sycl::multiplies<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis1_over_group_with_atomics_contig_impl<
@@ -305,8 +302,7 @@ struct ProductOverAxis0AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForProductReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = sycl::multiplies<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis0_over_group_with_atomics_contig_impl<
@@ -324,8 +320,7 @@ struct ProductOverAxis1TempsContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForProductReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = std::conditional_t<std::is_same_v<dstTy, bool>,
                                                     sycl::logical_and<dstTy>,
                                                     sycl::multiplies<dstTy>>;
@@ -345,8 +340,7 @@ struct ProductOverAxis0TempsContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForProductReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = std::conditional_t<std::is_same_v<dstTy, bool>,
                                                     sycl::logical_and<dstTy>,
                                                     sycl::multiplies<dstTy>>;
diff --git a/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp
index 5279b4f6c276..b8a042e9a55b 100644
--- a/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp
@@ -138,8 +138,7 @@ struct HypotOverAxisTempsStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForHypotReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = su_ns::Hypot<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_over_group_temps_strided_impl<srcTy, dstTy,
@@ -157,8 +156,7 @@ struct HypotOverAxis1TempsContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForHypotReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = su_ns::Hypot<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
@@ -176,8 +174,7 @@ struct HypotOverAxis0TempsContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForHypotReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = su_ns::Hypot<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
diff --git a/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
index 5f9cc32f1203..af6c3f0d513a 100644
--- a/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
+++ b/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
@@ -97,8 +97,7 @@ struct ArithmeticAtomicSupportFactory
     {
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (std::is_floating_point_v<T> ||
-                      std::is_same_v<T, sycl::half> || is_complex<T>::value)
-        {
+                      std::is_same_v<T, sycl::half> || is_complex<T>::value) {
             // for real- and complex- floating point types, tree reduction has
             // better round-off accumulation properties (round-off error is
             // proportional to the log2(reduction_size), while naive elementwise
@@ -117,10 +116,7 @@ struct ArithmeticAtomicSupportFactory
 template <typename fnT, typename T>
 struct MinMaxAtomicSupportFactory
 {
-    fnT get()
-    {
-        return check_atomic_support<T>;
-    }
+    fnT get() { return check_atomic_support<T>; }
 };
 
 template <typename fnT, typename T>
diff --git a/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp
index 936c8dbe9b56..8224163ccb19 100644
--- a/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp
+++ b/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp
@@ -96,8 +96,7 @@ bool py_reduction_dtype_supported(
     }
 
     if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
-        out_typeid >= td_ns::num_types)
-    {
+        out_typeid >= td_ns::num_types) {
         throw std::runtime_error("Reduction type support check: lookup failed");
     }
 
@@ -158,8 +157,7 @@ bool py_tree_reduction_dtype_supported(const py::dtype &input_dtype,
     }
 
     if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
-        out_typeid >= td_ns::num_types)
-    {
+        out_typeid >= td_ns::num_types) {
         throw std::runtime_error("Reduction type support check: lookup failed");
     }
 
@@ -259,8 +257,7 @@ std::pair<sycl::event, sycl::event> py_reduction_over_axis(
     bool is_src_f_contig = src.is_f_contiguous();
 
     if ((is_src_c_contig && is_dst_c_contig) ||
-        (is_src_f_contig && dst_nelems == 1))
-    {
+        (is_src_f_contig && dst_nelems == 1)) {
         // remove_all_extents gets underlying type of table
         using contig_fn_ptr_T =
             typename std::remove_all_extents<contig_fnT>::type;
@@ -292,8 +289,7 @@ std::pair<sycl::event, sycl::event> py_reduction_over_axis(
         }
     }
     else if (is_src_f_contig &&
-             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
-    {
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) {
         // remove_all_extents gets underlying type of table
         using contig_fn_ptr_T =
             typename std::remove_all_extents<contig_fnT>::type;
@@ -391,8 +387,7 @@ std::pair<sycl::event, sycl::event> py_reduction_over_axis(
                      simplified_iteration_src_strides[0]) == reduction_nelems);
         }
         else if (static_cast<std::size_t>(
-                     simplified_reduction_src_strides[0]) == iter_nelems)
-        {
+                     simplified_reduction_src_strides[0]) == iter_nelems) {
             mat_reduce_over_axis0 =
                 (simplified_iteration_dst_strides[0] == 1) &&
                 (simplified_iteration_src_strides[0] == 1);
@@ -586,8 +581,7 @@ std::pair<sycl::event, sycl::event> py_tree_reduction_over_axis(
     bool is_src_f_contig = src.is_f_contiguous();
 
     if ((is_src_c_contig && is_dst_c_contig) ||
-        (is_src_f_contig && dst_nelems == 1))
-    {
+        (is_src_f_contig && dst_nelems == 1)) {
         auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;
@@ -610,8 +604,7 @@ std::pair<sycl::event, sycl::event> py_tree_reduction_over_axis(
         }
     }
     else if (is_src_f_contig &&
-             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
-    {
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) {
         auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;
@@ -699,8 +692,7 @@ std::pair<sycl::event, sycl::event> py_tree_reduction_over_axis(
                      simplified_iteration_src_strides[0]) == reduction_nelems);
         }
         else if (static_cast<std::size_t>(
-                     simplified_reduction_src_strides[0]) == iter_nelems)
-        {
+                     simplified_reduction_src_strides[0]) == iter_nelems) {
             mat_reduce_over_axis0 =
                 (simplified_iteration_dst_strides[0] == 1) &&
                 (simplified_iteration_src_strides[0] == 1);
@@ -969,8 +961,7 @@ std::pair<sycl::event, sycl::event> py_search_over_axis(
                      simplified_iteration_src_strides[0]) == reduction_nelems);
         }
         else if (static_cast<std::size_t>(compact_reduction_src_strides[0]) ==
-                 iter_nelems)
-        {
+                 iter_nelems) {
             mat_reduce_over_axis0 =
                 (simplified_iteration_dst_strides[0] == 1) &&
                 (simplified_iteration_src_strides[0] == 1);
@@ -1153,8 +1144,7 @@ std::pair<sycl::event, sycl::event>
 
     // TODO: should be dst_nelems == 0?
     if ((is_src_c_contig && is_dst_c_contig) ||
-        (is_src_f_contig && dst_nelems == 0))
-    {
+        (is_src_f_contig && dst_nelems == 0)) {
         auto fn = axis1_contig_dispatch_vector[src_typeid];
         static constexpr py::ssize_t zero_offset = 0;
 
@@ -1168,8 +1158,7 @@ std::pair<sycl::event, sycl::event>
         return std::make_pair(keep_args_event, red_ev);
     }
     else if (is_src_f_contig &&
-             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
-    {
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) {
         auto fn = axis0_contig_dispatch_vector[src_typeid];
         static constexpr py::ssize_t zero_offset = 0;
 
diff --git a/dpnp/tensor/libtensor/source/reductions/sum.cpp b/dpnp/tensor/libtensor/source/reductions/sum.cpp
index d7142477750a..9a0d212ed8da 100644
--- a/dpnp/tensor/libtensor/source/reductions/sum.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/sum.cpp
@@ -246,8 +246,7 @@ struct SumOverAxisAtomicStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForSumReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = sycl::plus<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
@@ -285,8 +284,7 @@ struct SumOverAxis1AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForSumReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = sycl::plus<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis1_over_group_with_atomics_contig_impl<
@@ -304,8 +302,7 @@ struct SumOverAxis0AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForSumReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = sycl::plus<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis0_over_group_with_atomics_contig_impl<
diff --git a/dpnp/tensor/libtensor/source/repeat.cpp b/dpnp/tensor/libtensor/source/repeat.cpp
index 919f51f9a4d1..b809160e257b 100644
--- a/dpnp/tensor/libtensor/source/repeat.cpp
+++ b/dpnp/tensor/libtensor/source/repeat.cpp
@@ -136,8 +136,8 @@ std::pair<sycl::event, sycl::event>
         throw py::value_error("Expecting `cumsum` array to be C-contiguous.");
     }
 
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst}))
-    {
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {src, reps, cumsum, dst})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues");
     }
@@ -170,8 +170,7 @@ std::pair<sycl::event, sycl::event>
 
     // shape at repeated axis must be equal to the sum of reps
     if (!same_orthog_dims || src_axis_nelems != reps_sz ||
-        src_axis_nelems != cumsum_sz)
-    {
+        src_axis_nelems != cumsum_sz) {
         throw py::value_error("Inconsistent array dimensions");
     }
 
@@ -386,8 +385,8 @@ std::pair<sycl::event, sycl::event>
         throw py::value_error("Expecting `cumsum` array to be C-contiguous.");
     }
 
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst}))
-    {
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {src, reps, cumsum, dst})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues");
     }
diff --git a/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp b/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp
index 5e42938a22f2..573aaeb0a60b 100644
--- a/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp
+++ b/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp
@@ -350,8 +350,7 @@ void simplify_iteration_space_4(
         simplified_dst_strides.reserve(nd);
 
         if ((src1_strides[0] < 0) && (src2_strides[0] < 0) &&
-            (src3_strides[0] < 0) && (dst_strides[0] < 0))
-        {
+            (src3_strides[0] < 0) && (dst_strides[0] < 0)) {
             simplified_src1_strides.push_back(-src1_strides[0]);
             simplified_src2_strides.push_back(-src2_strides[0]);
             simplified_src3_strides.push_back(-src3_strides[0]);
diff --git a/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp b/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp
index 2b6dcc8bf447..11df5cd2ef47 100644
--- a/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp
+++ b/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp
@@ -72,8 +72,7 @@ struct AscendingArgSortContigFactory
     fnT get()
     {
         if constexpr (std::is_same_v<IndexTy, std::int64_t> ||
-                      std::is_same_v<IndexTy, std::int32_t>)
-        {
+                      std::is_same_v<IndexTy, std::int32_t>) {
             using dpctl::tensor::rich_comparisons::AscendingSorter;
             using Comp = typename AscendingSorter<argTy>::type;
 
@@ -92,8 +91,7 @@ struct DescendingArgSortContigFactory
     fnT get()
     {
         if constexpr (std::is_same_v<IndexTy, std::int64_t> ||
-                      std::is_same_v<IndexTy, std::int32_t>)
-        {
+                      std::is_same_v<IndexTy, std::int32_t>) {
             using dpctl::tensor::rich_comparisons::DescendingSorter;
             using Comp = typename DescendingSorter<argTy>::type;
 
diff --git a/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp b/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp
index 6328b3339376..018f3166a0ad 100644
--- a/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp
+++ b/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp
@@ -128,8 +128,7 @@ std::pair<sycl::event, sycl::event>
     int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
 
     if ((dst_typeid != static_cast<int>(td_ns::typenum_t::INT64)) &&
-        (dst_typeid != static_cast<int>(td_ns::typenum_t::INT32)))
-    {
+        (dst_typeid != static_cast<int>(td_ns::typenum_t::INT32))) {
         throw py::value_error(
             "Output index array must have data type int32 or int64");
     }
diff --git a/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp b/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp
index e54b8f739a4b..0eec8fba9ded 100644
--- a/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp
+++ b/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp
@@ -104,8 +104,7 @@ struct AscendingRadixArgSortContigFactory
     {
         if constexpr (RadixSortSupportVector<argTy>::is_defined &&
                       (std::is_same_v<IndexTy, std::int64_t> ||
-                       std::is_same_v<IndexTy, std::int32_t>))
-        {
+                       std::is_same_v<IndexTy, std::int32_t>)) {
             return argsort_axis1_contig_caller<
                 /*ascending*/ true, argTy, IndexTy>;
         }
@@ -122,8 +121,7 @@ struct DescendingRadixArgSortContigFactory
     {
         if constexpr (RadixSortSupportVector<argTy>::is_defined &&
                       (std::is_same_v<IndexTy, std::int64_t> ||
-                       std::is_same_v<IndexTy, std::int32_t>))
-        {
+                       std::is_same_v<IndexTy, std::int32_t>)) {
             return argsort_axis1_contig_caller<
                 /*ascending*/ false, argTy, IndexTy>;
         }
diff --git a/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp b/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp
index 8b1ce04a97d6..6c50b0cbc08c 100644
--- a/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp
+++ b/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp
@@ -82,8 +82,7 @@ struct LeftSideSearchSortedContigFactory
     fnT get() const
     {
         if constexpr (std::is_same_v<indTy, std::int32_t> ||
-                      std::is_same_v<indTy, std::int64_t>)
-        {
+                      std::is_same_v<indTy, std::int64_t>) {
             static constexpr bool left_side_search(true);
             using dpctl::tensor::kernels::searchsorted_contig_impl;
             using dpctl::tensor::rich_comparisons::AscendingSorter;
@@ -107,8 +106,7 @@ struct RightSideSearchSortedContigFactory
     fnT get() const
     {
         if constexpr (std::is_same_v<indTy, std::int32_t> ||
-                      std::is_same_v<indTy, std::int64_t>)
-        {
+                      std::is_same_v<indTy, std::int64_t>) {
             static constexpr bool right_side_search(false);
 
             using dpctl::tensor::kernels::searchsorted_contig_impl;
@@ -141,8 +139,7 @@ struct LeftSideSearchSortedStridedFactory
     fnT get() const
     {
         if constexpr (std::is_same_v<indTy, std::int32_t> ||
-                      std::is_same_v<indTy, std::int64_t>)
-        {
+                      std::is_same_v<indTy, std::int64_t>) {
             static constexpr bool left_side_search(true);
             using dpctl::tensor::kernels::searchsorted_strided_impl;
             using dpctl::tensor::rich_comparisons::AscendingSorter;
@@ -166,8 +163,7 @@ struct RightSideSearchSortedStridedFactory
     fnT get() const
     {
         if constexpr (std::is_same_v<indTy, std::int32_t> ||
-                      std::is_same_v<indTy, std::int64_t>)
-        {
+                      std::is_same_v<indTy, std::int64_t>) {
             static constexpr bool right_side_search(false);
             using dpctl::tensor::kernels::searchsorted_strided_impl;
             using dpctl::tensor::rich_comparisons::AscendingSorter;
@@ -263,8 +259,8 @@ std::pair<sycl::event, sycl::event>
     dpctl::tensor::validation::CheckWritable::throw_if_not_writable(positions);
 
     // check that queues are compatible
-    if (!dpctl::utils::queues_are_compatible(exec_q, {hay, needles, positions}))
-    {
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {hay, needles, positions})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues");
     }
@@ -295,8 +291,7 @@ std::pair<sycl::event, sycl::event>
     const auto positions_typenum_t_v =
         static_cast<td_ns::typenum_t>(positions_typeid);
     if (positions_typenum_t_v != td_ns::typenum_t::INT32 &&
-        positions_typenum_t_v != td_ns::typenum_t::INT64)
-    {
+        positions_typenum_t_v != td_ns::typenum_t::INT64) {
         throw py::value_error(
             "Positions array must have data-type int32, or int64");
     }
diff --git a/dpnp/tensor/libtensor/source/where.cpp b/dpnp/tensor/libtensor/source/where.cpp
index 46c52cf83b34..1d535a712917 100644
--- a/dpnp/tensor/libtensor/source/where.cpp
+++ b/dpnp/tensor/libtensor/source/where.cpp
@@ -79,8 +79,8 @@ std::pair<sycl::event, sycl::event>
              const std::vector<sycl::event> &depends)
 {
 
-    if (!dpctl::utils::queues_are_compatible(exec_q, {x1, x2, condition, dst}))
-    {
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {x1, x2, condition, dst})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues");
     }
@@ -129,8 +129,7 @@ std::pair<sycl::event, sycl::event>
         dpctl::tensor::overlap::SameLogicalTensors();
     if ((overlap(dst, condition) && !same_logical_tensors(dst, condition)) ||
         (overlap(dst, x1) && !same_logical_tensors(dst, x1)) ||
-        (overlap(dst, x2) && !same_logical_tensors(dst, x2)))
-    {
+        (overlap(dst, x2) && !same_logical_tensors(dst, x2))) {
         throw py::value_error("Destination array overlaps with input.");
     }
 

From fbc4f43466897e28bd79d03f5cf709c222209e17 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 8 Apr 2026 11:17:42 -0700
Subject: [PATCH 28/43] add __main__.py

---
 dpnp/__main__.py | 78 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 dpnp/__main__.py

diff --git a/dpnp/__main__.py b/dpnp/__main__.py
new file mode 100644
index 000000000000..6871022a61a6
--- /dev/null
+++ b/dpnp/__main__.py
@@ -0,0 +1,78 @@
+# *****************************************************************************
+# Copyright (c) 2016, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import argparse
+import importlib
+import os
+import os.path
+import sys
+
+
+def _dpnp_dir() -> str:
+    dpnp_dir = importlib.util.find_spec("dpnp").submodule_search_locations[0]
+    abs_dpnp_dir = os.path.abspath(dpnp_dir)
+    return abs_dpnp_dir
+
+
+def get_tensor_include_dir() -> str:
+    """Prints path to dpnp libtensor include directory"""
+    dpnp_dir = _dpnp_dir()
+    libtensor_dir = os.path.join(dpnp_dir, "tensor", "libtensor", "include")
+    return libtensor_dir
+
+
+def print_tensor_include_flags() -> None:
+    """Prints include flags for dpnp tensor library"""
+    libtensor_dir = get_tensor_include_dir()
+    print("-I " + libtensor_dir)
+
+
+def main() -> None:
+    """Main entry-point."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--tensor-includes",
+        action="store_true",
+        help="Include flags for dpnp libtensor headers.",
+    )
+    parser.add_argument(
+        "--tensor-include-dir",
+        action="store_true",
+        help="Path to dpnp libtensor include directory.",
+    )
+    args = parser.parse_args()
+    if not sys.argv[1:]:
+        parser.print_help()
+    if args.tensor_includes:
+        print_tensor_include_flags()
+    if args.tensor_include_dir:
+        print(get_tensor_include_dir())
+
+
+if __name__ == "__main__":
+    main()

From 1679951760e0f7311e4305e72130b6afce533f89 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Fri, 10 Apr 2026 10:06:04 -0700
Subject: [PATCH 29/43] add test for new CLI options

---
 dpnp/tests/test_cli_options.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 dpnp/tests/test_cli_options.py

diff --git a/dpnp/tests/test_cli_options.py b/dpnp/tests/test_cli_options.py
new file mode 100644
index 000000000000..0caca95f3974
--- /dev/null
+++ b/dpnp/tests/test_cli_options.py
@@ -0,0 +1,20 @@
+import subprocess
+import sys
+
+
+def test_tensor_includes():
+    res = subprocess.run(
+        [sys.executable, "-m", "dpnp", "--tensor-includes"],
+        capture_output=True,
+    )
+    assert res.returncode == 0
+    assert res.stdout
+    flags = res.stdout.decode("utf-8")
+    res = subprocess.run(
+        [sys.executable, "-m", "dpnp", "--tensor-include-dir"],
+        capture_output=True,
+    )
+    assert res.returncode == 0
+    assert res.stdout
+    dir = res.stdout.decode("utf-8")
+    assert flags == "-I " + dir

From 8fb303d535056c3949febfe24ae9667ac6d58394 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 8 Apr 2026 11:17:42 -0700
Subject: [PATCH 30/43] add __main__.py

---
 dpnp/__main__.py | 78 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 dpnp/__main__.py

diff --git a/dpnp/__main__.py b/dpnp/__main__.py
new file mode 100644
index 000000000000..6871022a61a6
--- /dev/null
+++ b/dpnp/__main__.py
@@ -0,0 +1,78 @@
+# *****************************************************************************
+# Copyright (c) 2016, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import argparse
+import importlib
+import os
+import os.path
+import sys
+
+
+def _dpnp_dir() -> str:
+    dpnp_dir = importlib.util.find_spec("dpnp").submodule_search_locations[0]
+    abs_dpnp_dir = os.path.abspath(dpnp_dir)
+    return abs_dpnp_dir
+
+
+def get_tensor_include_dir() -> str:
+    """Prints path to dpnp libtensor include directory"""
+    dpnp_dir = _dpnp_dir()
+    libtensor_dir = os.path.join(dpnp_dir, "tensor", "libtensor", "include")
+    return libtensor_dir
+
+
+def print_tensor_include_flags() -> None:
+    """Prints include flags for dpnp tensor library"""
+    libtensor_dir = get_tensor_include_dir()
+    print("-I " + libtensor_dir)
+
+
+def main() -> None:
+    """Main entry-point."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--tensor-includes",
+        action="store_true",
+        help="Include flags for dpnp libtensor headers.",
+    )
+    parser.add_argument(
+        "--tensor-include-dir",
+        action="store_true",
+        help="Path to dpnp libtensor include directory.",
+    )
+    args = parser.parse_args()
+    if not sys.argv[1:]:
+        parser.print_help()
+    if args.tensor_includes:
+        print_tensor_include_flags()
+    if args.tensor_include_dir:
+        print(get_tensor_include_dir())
+
+
+if __name__ == "__main__":
+    main()

From cf313c80258fb2ed6becc59e7dfca88e5eb1c7b4 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Fri, 10 Apr 2026 10:06:04 -0700
Subject: [PATCH 31/43] add test for new CLI options

---
 dpnp/tests/test_cli_options.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 dpnp/tests/test_cli_options.py

diff --git a/dpnp/tests/test_cli_options.py b/dpnp/tests/test_cli_options.py
new file mode 100644
index 000000000000..0caca95f3974
--- /dev/null
+++ b/dpnp/tests/test_cli_options.py
@@ -0,0 +1,20 @@
+import subprocess
+import sys
+
+
+def test_tensor_includes():
+    res = subprocess.run(
+        [sys.executable, "-m", "dpnp", "--tensor-includes"],
+        capture_output=True,
+    )
+    assert res.returncode == 0
+    assert res.stdout
+    flags = res.stdout.decode("utf-8")
+    res = subprocess.run(
+        [sys.executable, "-m", "dpnp", "--tensor-include-dir"],
+        capture_output=True,
+    )
+    assert res.returncode == 0
+    assert res.stdout
+    dir = res.stdout.decode("utf-8")
+    assert flags == "-I " + dir

From c83f72b7dad28b96f123c06e4dfc758eebd5b1f9 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Sun, 12 Apr 2026 16:29:24 -0700
Subject: [PATCH 32/43] Remove the unnecessary inludes after the merge

---
 dpnp/backend/extensions/indexing/CMakeLists.txt                 | 1 +
 dpnp/backend/extensions/indexing/choose.cpp                     | 2 --
 dpnp/backend/extensions/statistics/CMakeLists.txt               | 1 +
 dpnp/backend/extensions/statistics/histogram_common.hpp         | 2 +-
 dpnp/backend/extensions/statistics/sliding_window1d.hpp         | 2 +-
 .../extensions/ufunc/elementwise_functions/interpolate.cpp      | 2 ++
 6 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt
index 0ca611bfdc9f..731a059e4a39 100644
--- a/dpnp/backend/extensions/indexing/CMakeLists.txt
+++ b/dpnp/backend/extensions/indexing/CMakeLists.txt
@@ -66,6 +66,7 @@ set_target_properties(
 target_include_directories(
     ${python_module_name}
     PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/../../
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
         ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include
diff --git a/dpnp/backend/extensions/indexing/choose.cpp b/dpnp/backend/extensions/indexing/choose.cpp
index bc315e4d93d9..fafcbe1f2495 100644
--- a/dpnp/backend/extensions/indexing/choose.cpp
+++ b/dpnp/backend/extensions/indexing/choose.cpp
@@ -44,8 +44,6 @@
 
 #include "dpnp4pybind11.hpp"
 
-#include "choose_kernel.hpp"
-
 #include "ext/common.hpp"
 #include "kernels/indexing/choose.hpp"
 
diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index 701a852c5903..c78f4b77283f 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -71,6 +71,7 @@ set_target_properties(
 target_include_directories(
     ${python_module_name}
     PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/../../
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
         ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include
diff --git a/dpnp/backend/extensions/statistics/histogram_common.hpp b/dpnp/backend/extensions/statistics/histogram_common.hpp
index 8091e8874d17..47fef11061f3 100644
--- a/dpnp/backend/extensions/statistics/histogram_common.hpp
+++ b/dpnp/backend/extensions/statistics/histogram_common.hpp
@@ -35,7 +35,7 @@
 
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "ext/common.hpp"
 #include "kernels/statistics/histogram.hpp"
diff --git a/dpnp/backend/extensions/statistics/sliding_window1d.hpp b/dpnp/backend/extensions/statistics/sliding_window1d.hpp
index 329c96dfc1c6..a13c1f873e78 100644
--- a/dpnp/backend/extensions/statistics/sliding_window1d.hpp
+++ b/dpnp/backend/extensions/statistics/sliding_window1d.hpp
@@ -34,7 +34,7 @@
 
 #include <sycl/sycl.hpp>
 
-#include "dpctl4pybind11.hpp"
+#include "dpnp4pybind11.hpp"
 
 #include "kernels/statistics/sliding_window1d.hpp"
 
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
index c12d96e762de..36dae50e7b2c 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
@@ -43,6 +43,8 @@
 
 #include "dpnp4pybind11.hpp"
 
+#include "kernels/elementwise_functions/interpolate.hpp"
+
 // dpctl tensor headers
 #include "utils/type_dispatch.hpp"
 #include "utils/type_utils.hpp"

From e89e3d6638f1e37e902bdba15a449cc352dade30 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Fri, 10 Apr 2026 18:32:10 +0200
Subject: [PATCH 33/43] Enable muted tests for dpnp.cumlogsumexp (#2842)

There was a w/a implemented in scope of
[dpctl#2275](https://github.com/IntelPython/dpctl/pull/2275).
Thus the PR enables the previously muted tests for `dpnp.cumlogsumexp`.
---
 dpnp/tests/test_mathematical.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py
index 1bf911785965..8de7ec2ed80d 100644
--- a/dpnp/tests/test_mathematical.py
+++ b/dpnp/tests/test_mathematical.py
@@ -17,7 +17,6 @@
 from dpnp.tensor._numpy_helper import normalize_axis_index
 
 from .helper import (
-    LTS_VERSION,
     assert_dtype_allclose,
     generate_random_numpy_array,
     get_abs_array,
@@ -31,7 +30,6 @@
     has_support_aspect16,
     has_support_aspect64,
     is_intel_numpy,
-    is_lts_driver,
     numpy_version,
 )
 from .third_party.cupy import testing
@@ -216,9 +214,6 @@ def _get_exp_array(self, a, axis, dtype):
     @pytest.mark.parametrize("axis", [None, 2, -1])
     @pytest.mark.parametrize("include_initial", [True, False])
     def test_basic(self, dtype, axis, include_initial):
-        if axis is None and not is_lts_driver(version=LTS_VERSION.V1_6):
-            pytest.skip("due to SAT-8336")
-
         a = dpnp.ones((3, 4, 5, 6, 7), dtype=dtype)
         res = dpnp.cumlogsumexp(a, axis=axis, include_initial=include_initial)
 
@@ -236,9 +231,6 @@ def test_basic(self, dtype, axis, include_initial):
     @pytest.mark.parametrize("axis", [None, 2, -1])
     @pytest.mark.parametrize("include_initial", [True, False])
     def test_include_initial(self, dtype, axis, include_initial):
-        if axis is None and not is_lts_driver(version=LTS_VERSION.V1_6):
-            pytest.skip("due to SAT-8336")
-
         a = dpnp.ones((3, 4, 5, 6, 7), dtype=dtype)
 
         if dpnp.issubdtype(a, dpnp.float32):

From cd1f5ca25d7821faddd1482b0b6b26f4877b3828 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 12 Apr 2026 16:36:41 +0200
Subject: [PATCH 34/43] Bump actions/upload-artifact from 7.0.0 to 7.0.1
 (#2847)

Bumps
[actions/upload-artifact](https://github.com/actions/upload-artifact)
from 7.0.0 to 7.0.1.
---
 .github/workflows/check-onemath.yaml    | 2 +-
 .github/workflows/conda-package.yml     | 4 ++--
 .github/workflows/openssf-scorecard.yml | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/check-onemath.yaml b/.github/workflows/check-onemath.yaml
index a56fc5a368aa..3ad8ba1ee84e 100644
--- a/.github/workflows/check-onemath.yaml
+++ b/.github/workflows/check-onemath.yaml
@@ -57,7 +57,7 @@ jobs:
           cat ${{ env.environment-file }}
 
       - name: Upload artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: ${{ env.environment-file-name }}
           path: ${{ env.environment-file }}
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 5c6ae21b58f9..afd34ee00543 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -102,13 +102,13 @@ jobs:
           MAX_BUILD_CMPL_MKL_VERSION: '2026.0a0'
 
       - name: Upload artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Python ${{ matrix.python }}
           path: ${{ env.CONDA_BLD }}${{ env.package-name }}-*.conda
 
       - name: Upload wheels artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
           path: ${{ env.WHEELS_OUTPUT_FOLDER }}${{ env.package-name }}-*.whl
diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml
index e5cc7dec86ea..d80ec0b4704a 100644
--- a/.github/workflows/openssf-scorecard.yml
+++ b/.github/workflows/openssf-scorecard.yml
@@ -64,7 +64,7 @@ jobs:
       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
       # format to the repository Actions tab.
       - name: "Upload artifact"
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: SARIF file
           path: results.sarif

From eb94817eb54b005c5b297a5da250e06f2fea8965 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 12 Apr 2026 22:04:15 +0200
Subject: [PATCH 35/43] Bump peter-evans/create-pull-request from 8.1.0 to
 8.1.1 (#2846)

Bumps
[peter-evans/create-pull-request](https://github.com/peter-evans/create-pull-request)
from 8.1.0 to 8.1.1.
---
 .github/workflows/pre-commit-autoupdate.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pre-commit-autoupdate.yml b/.github/workflows/pre-commit-autoupdate.yml
index 0cbd6eeb6969..66df2d06d0c6 100644
--- a/.github/workflows/pre-commit-autoupdate.yml
+++ b/.github/workflows/pre-commit-autoupdate.yml
@@ -38,7 +38,7 @@ jobs:
         run: pre-commit autoupdate
 
       - name: Create a PR with autoupdate changes
-        uses: peter-evans/create-pull-request@c0f553fe549906ede9cf27b5156039d195d2ece0 #v8.1.0
+        uses: peter-evans/create-pull-request@5f6978faf089d4d20b00c7766989d076bb2fc7f1 #v8.1.1
         with:
           commit-message: 'chore: update pre-commit hooks'
           add-paths: .pre-commit-config.yaml

From 3a1db6fc2e9605c8cd77aec7a7565e98d3c22398 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 13 Apr 2026 05:52:04 -0700
Subject: [PATCH 36/43] Remove redundant Dpctl_TENSOR_INCLUDE_DIR from backend
 extension CMakeLists

---
 dpnp/backend/extensions/blas/CMakeLists.txt       | 1 -
 dpnp/backend/extensions/fft/CMakeLists.txt        | 1 -
 dpnp/backend/extensions/indexing/CMakeLists.txt   | 1 -
 dpnp/backend/extensions/lapack/CMakeLists.txt     | 1 -
 dpnp/backend/extensions/statistics/CMakeLists.txt | 1 -
 dpnp/backend/extensions/ufunc/CMakeLists.txt      | 1 -
 dpnp/backend/extensions/vm/CMakeLists.txt         | 1 -
 dpnp/backend/extensions/window/CMakeLists.txt     | 1 -
 8 files changed, 8 deletions(-)

diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt
index 67e0d4cf02e1..b4013d82eb40 100644
--- a/dpnp/backend/extensions/blas/CMakeLists.txt
+++ b/dpnp/backend/extensions/blas/CMakeLists.txt
@@ -81,7 +81,6 @@ target_include_directories(
     PRIVATE
         ${SYCL_INCLUDE_DIR}
         ${Dpctl_INCLUDE_DIRS}
-        ${Dpctl_TENSOR_INCLUDE_DIR}
         ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt
index 8f5179bbbd76..9c452d94bd23 100644
--- a/dpnp/backend/extensions/fft/CMakeLists.txt
+++ b/dpnp/backend/extensions/fft/CMakeLists.txt
@@ -74,7 +74,6 @@ target_include_directories(
     PRIVATE
         ${SYCL_INCLUDE_DIR}
         ${Dpctl_INCLUDE_DIRS}
-        ${Dpctl_TENSOR_INCLUDE_DIR}
         ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt
index 731a059e4a39..ce800a87124c 100644
--- a/dpnp/backend/extensions/indexing/CMakeLists.txt
+++ b/dpnp/backend/extensions/indexing/CMakeLists.txt
@@ -79,7 +79,6 @@ target_include_directories(
     PRIVATE
         ${SYCL_INCLUDE_DIR}
         ${Dpctl_INCLUDE_DIRS}
-        ${Dpctl_TENSOR_INCLUDE_DIR}
         ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt
index 6bf25ee651d2..9622ec0dcc13 100644
--- a/dpnp/backend/extensions/lapack/CMakeLists.txt
+++ b/dpnp/backend/extensions/lapack/CMakeLists.txt
@@ -99,7 +99,6 @@ target_include_directories(
     PRIVATE
         ${SYCL_INCLUDE_DIR}
         ${Dpctl_INCLUDE_DIRS}
-        ${Dpctl_TENSOR_INCLUDE_DIR}
         ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index c78f4b77283f..434d223de3ab 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -84,7 +84,6 @@ target_include_directories(
     PRIVATE
         ${SYCL_INCLUDE_DIR}
         ${Dpctl_INCLUDE_DIRS}
-        ${Dpctl_TENSOR_INCLUDE_DIR}
         ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt
index 68e6bf29135d..2b01823d01f3 100644
--- a/dpnp/backend/extensions/ufunc/CMakeLists.txt
+++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt
@@ -101,7 +101,6 @@ target_include_directories(
     PRIVATE
         ${SYCL_INCLUDE_DIR}
         ${Dpctl_INCLUDE_DIRS}
-        ${Dpctl_TENSOR_INCLUDE_DIR}
         ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt
index a739838c8dcd..05aa64e0d814 100644
--- a/dpnp/backend/extensions/vm/CMakeLists.txt
+++ b/dpnp/backend/extensions/vm/CMakeLists.txt
@@ -123,7 +123,6 @@ target_include_directories(
     PRIVATE
         ${SYCL_INCLUDE_DIR}
         ${Dpctl_INCLUDE_DIRS}
-        ${Dpctl_TENSOR_INCLUDE_DIR}
         ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 
diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt
index 085cd47e7891..9dac2df9d0df 100644
--- a/dpnp/backend/extensions/window/CMakeLists.txt
+++ b/dpnp/backend/extensions/window/CMakeLists.txt
@@ -79,7 +79,6 @@ target_include_directories(
     PRIVATE
         ${SYCL_INCLUDE_DIR}
         ${Dpctl_INCLUDE_DIRS}
-        ${Dpctl_TENSOR_INCLUDE_DIR}
         ${CMAKE_BINARY_DIR} # For generated Cython headers
 )
 

From 6857a9fc26c10d6789bffc6c558607f5d38760ea Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 13 Apr 2026 07:00:31 -0700
Subject: [PATCH 37/43] Update copyright year in __main__.py

---
 dpnp/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpnp/__main__.py b/dpnp/__main__.py
index 6871022a61a6..1c9c652109ee 100644
--- a/dpnp/__main__.py
+++ b/dpnp/__main__.py
@@ -1,5 +1,5 @@
 # *****************************************************************************
-# Copyright (c) 2016, Intel Corporation
+# Copyright (c) 2026, Intel Corporation
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without

From cb1044da4be9d2a03d967e66b1ce5fd32353cc75 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 13 Apr 2026 07:04:21 -0700
Subject: [PATCH 38/43] Remove redundant include directories from lapack
 extension

---
 dpnp/backend/extensions/lapack/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt
index 9622ec0dcc13..6c898df05236 100644
--- a/dpnp/backend/extensions/lapack/CMakeLists.txt
+++ b/dpnp/backend/extensions/lapack/CMakeLists.txt
@@ -86,7 +86,6 @@ set_target_properties(
 target_include_directories(
     ${python_module_name}
     PRIVATE
-        ${CMAKE_CURRENT_SOURCE_DIR}/../../
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_SOURCE_DIR}/dpnp/backend/include
         ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include

From 4b163bfbf40967403819657cf218e69412a073e0 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 14 Apr 2026 20:01:49 +0200
Subject: [PATCH 39/43] Add cross-reference fallback for dpnp.tensor to dpctl
 0.21.1 docs (#2848)

This PR proposes to replace `dpctl.tensor` with `dpnp.tensor` across the
error messages.
Add a Sphinx handler to redirect dpnp.tensor.* cross-references to dpctl
0.21.1 docs
and `tensor.rst` page linking to the dpctl API reference
---
 doc/conf.py                                   | 66 +++++++++++++++++
 doc/index.rst                                 |  1 +
 doc/reference/exceptions.rst                  |  2 +-
 doc/tensor.rst                                | 70 ++++++++++++++++++
 dpnp/dpnp_algo/dpnp_arraycreation.py          |  2 +-
 dpnp/dpnp_algo/dpnp_elementwise_common.py     |  8 +--
 dpnp/dpnp_array.py                            | 22 +++---
 dpnp/dpnp_container.py                        | 12 ++--
 dpnp/dpnp_iface.py                            | 30 ++++----
 dpnp/dpnp_iface_arraycreation.py              | 72 +++++++++----------
 dpnp/dpnp_iface_indexing.py                   | 12 ++--
 dpnp/dpnp_iface_manipulation.py               |  6 +-
 dpnp/dpnp_iface_nanfunctions.py               | 10 +--
 dpnp/dpnp_iface_sorting.py                    |  2 +-
 dpnp/dpnp_iface_window.py                     | 10 +--
 dpnp/dpnp_utils/dpnp_algo_utils.pyx           |  2 +-
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py   |  6 +-
 dpnp/dpnp_utils/dpnp_utils_reduction.py       |  2 +-
 dpnp/fft/dpnp_iface_fft.py                    |  4 +-
 dpnp/linalg/dpnp_utils_linalg.py              |  2 +-
 dpnp/memory/_memory.py                        |  2 +-
 dpnp/random/dpnp_iface_random.py              | 24 +++----
 dpnp/random/dpnp_random_state.py              |  2 +-
 dpnp/tensor/_accumulation.py                  |  2 +-
 dpnp/tensor/_clip.py                          |  2 +-
 dpnp/tensor/_copy_utils.py                    | 18 ++---
 dpnp/tensor/_ctors.py                         |  6 +-
 dpnp/tensor/_elementwise_common.py            |  4 +-
 dpnp/tensor/_indexing_functions.py            | 20 +++---
 dpnp/tensor/_linear_algebra_functions.py      | 14 ++--
 dpnp/tensor/_print.py                         |  4 +-
 dpnp/tensor/_reduction.py                     |  6 +-
 dpnp/tensor/_search_functions.py              |  2 +-
 dpnp/tensor/_searchsorted.py                  |  8 +--
 dpnp/tensor/_set_functions.py                 |  8 +--
 dpnp/tensor/_slicing.pxi                      |  4 +-
 dpnp/tensor/_sorting.py                       | 12 +---
 dpnp/tensor/_statistical_functions.py         |  6 +-
 dpnp/tensor/_testing.py                       |  4 +-
 dpnp/tensor/_usmarray.pyx                     | 10 +--
 dpnp/tensor/_utility_functions.py             |  4 +-
 dpnp/tests/tensor/test_tensor_clip.py         |  2 +-
 dpnp/tests/tensor/test_tensor_diff.py         |  2 +-
 .../cupy/indexing_tests/test_insert.py        |  2 +-
 44 files changed, 319 insertions(+), 190 deletions(-)
 create mode 100644 doc/tensor.rst

diff --git a/doc/conf.py b/doc/conf.py
index 469e6d5f5353..57119eab5396 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -6,6 +6,7 @@
 # http://www.sphinx-doc.org/en/master/config
 
 from datetime import datetime
+from urllib.parse import urljoin
 
 from sphinx.ext.autodoc import FunctionDocumenter
 from sphinx.ext.napoleon import NumpyDocstring, docstring
@@ -231,6 +232,9 @@ def _can_document_member(member, *args, **kwargs):
 
 autosummary_generate = True
 
+_DPCTL_021_BASE = "https://intelpython.github.io/dpctl/0.21.1/"
+_DPCTL_021_INV = urljoin(_DPCTL_021_BASE, "objects.inv")
+
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3/", None),
     "numpy": ("https://numpy.org/doc/stable/", None),
@@ -302,3 +306,65 @@ def _parse_returns_section_patched(self, section: str) -> list[str]:
 
 
 NumpyDocstring._parse_returns_section = _parse_returns_section_patched
+
+
+# TODO: Remove once dpnp.tensor docs are generated in dpnp
+def _load_dpctl_tensor_inventory(app):
+    """Load dpctl 0.21.1 inventory for dpnp.tensor fallback only."""
+    from sphinx.ext.intersphinx import fetch_inventory
+    from sphinx.util import logging
+
+    logger = logging.getLogger(__name__)
+
+    try:
+        inv = fetch_inventory(app, _DPCTL_021_BASE, _DPCTL_021_INV)
+    except Exception as exc:
+        logger.warning(
+            "Failed to load dpctl 0.21.1 inventory from %s: %s",
+            _DPCTL_021_INV,
+            exc,
+        )
+        inv = {}
+
+    app.builder.env._dpctl_tensor_021_inventory = inv
+
+
+# TODO: Remove once dpnp.tensor docs are generated in dpnp
+def _resolve_dpnp_tensor_refs(app, env, node, contnode):
+    """Resolve dpnp.tensor.* references to dpctl 0.21.1 documentation.
+
+    This temporary workaround is needed because dpnp.tensor documentation
+    is not generated yet, while the corresponding API is still documented
+    in dpctl 0.21.1.
+    """
+    from docutils import nodes as docutils_nodes
+
+    target = node.get("reftarget", "")
+    if not target.startswith("dpnp.tensor"):
+        return None
+
+    dpctl_target = target.replace("dpnp.tensor", "dpctl.tensor", 1)
+    dpctl_tensor_inv = getattr(env, "_dpctl_tensor_021_inventory", {})
+
+    for _objtype, objects in dpctl_tensor_inv.items():
+        if dpctl_target not in objects:
+            continue
+
+        item = objects[dpctl_target]
+        location = item.uri
+        if location.endswith("$"):
+            location = location[:-1] + dpctl_target
+
+        refuri = urljoin(_DPCTL_021_BASE, location)
+        newnode = docutils_nodes.reference(
+            "", "", internal=False, refuri=refuri
+        )
+        newnode += contnode.deepcopy()
+        return newnode
+
+    return None
+
+
+def setup(app):
+    app.connect("builder-inited", _load_dpctl_tensor_inventory, priority=400)
+    app.connect("missing-reference", _resolve_dpnp_tensor_refs, priority=400)
diff --git a/doc/index.rst b/doc/index.rst
index 38c12489636b..847680fc11d9 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -13,6 +13,7 @@ Data Parallel Extension for NumPy*
    overview
    quick_start_guide
    reference/index
+   tensor
 
 .. toctree::
    :maxdepth: 1
diff --git a/doc/reference/exceptions.rst b/doc/reference/exceptions.rst
index 8f459b9f3aaa..69980ac8d8c2 100644
--- a/doc/reference/exceptions.rst
+++ b/doc/reference/exceptions.rst
@@ -20,7 +20,7 @@ Exceptions
 .. exception:: DLPackCreationError
 
    Given when constructing DLPack capsule from either :class:`dpnp.ndarray` or
-   :class:`dpctl.tensor.usm_ndarray` based on a USM allocation
+   :class:`dpnp.tensor.usm_ndarray` based on a USM allocation
    on a partitioned SYCL device.
 
    .. rubric:: Examples
diff --git a/doc/tensor.rst b/doc/tensor.rst
new file mode 100644
index 000000000000..22a1812f38a3
--- /dev/null
+++ b/doc/tensor.rst
@@ -0,0 +1,70 @@
+.. _tensor:
+
+Tensor (``dpnp.tensor``)
+========================
+
+``dpnp.tensor`` provides a reference implementation of the
+`Python Array API <https://data-apis.org/array-api/latest/>`_ specification.
+The implementation uses data-parallel algorithms suitable for execution on
+accelerators, such as GPUs.
+
+It also provides the underlying Array API-compliant implementation
+used by ``dpnp``.
+
+``dpnp.tensor`` is written using C++ and
+`SYCL <https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html>`_
+and oneAPI extensions implemented in
+`Intel(R) oneAPI DPC++ compiler <https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compiler.html>`_.
+
+Design and Motivation
+---------------------
+
+The tensor implementation was originally developed as a standalone project and
+later integrated into the `dpctl <https://intelpython.github.io/dpctl/latest/index.html>`_
+library as ``dpctl.tensor``. It has since been migrated into ``dpnp``,
+making ``dpnp`` the primary owner and development location of the tensor implementation.
+
+This change simplifies maintenance, reduces cross-project
+dependencies, and enables independent development and release cycles.
+
+Relationship to ``dpnp.ndarray``
+--------------------------------
+
+:class:`dpnp.ndarray` is a high-level array object built on top of
+``dpnp.tensor.usm_ndarray``, storing array data in Unified Shared Memory
+(USM) allocated on a SYCL device. Most users interact with
+:class:`dpnp.ndarray` directly; ``dpnp.tensor.usm_ndarray`` may appear in error
+messages or type signatures when working with device placement or
+interoperability.
+
+Relationship to ``dpctl``
+-------------------------
+
+The migration of ``dpctl.tensor`` into ``dpnp.tensor`` does not replace
+`dpctl <https://intelpython.github.io/dpctl/latest/index.html>`_ itself.
+``dpctl`` remains responsible for device and queue management
+(:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`) as well as USM memory
+allocation. ``dpnp`` builds on top of these capabilities.
+
+Example
+-------
+
+.. code-block:: python
+
+    import dpnp
+    import dpnp.tensor as dpt
+
+    # Create a tensor array on the default device
+    x = dpt.asarray([1.0, 2.0, 3.0])
+
+    # dpnp.ndarray wraps the underlying usm_ndarray
+    a = dpnp.asarray([1.0, 2.0, 3.0])
+    assert isinstance(a.get_array(), dpt.usm_ndarray)
+
+.. note::
+
+   The ``dpnp.tensor`` API documentation will be added in a future release.
+
+   The current implementation remains compatible with the original
+   ``dpctl.tensor`` API. For the complete API reference, see the
+   `dpctl 0.21.1 tensor documentation <https://intelpython.github.io/dpctl/0.21.1/api_reference/dpctl/tensor.html>`_.
diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py
index df21ea5bbc44..9c9110b85384 100644
--- a/dpnp/dpnp_algo/dpnp_arraycreation.py
+++ b/dpnp/dpnp_algo/dpnp_arraycreation.py
@@ -45,7 +45,7 @@
 
 
 def _as_usm_ndarray(a, usm_type, sycl_queue):
-    """Converts input object to `dpctl.tensor.usm_ndarray`"""
+    """Converts input object to `dpnp.tensor.usm_ndarray`"""
 
     if isinstance(a, dpnp_array):
         a = a.get_array()
diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
index 4eb613db35a2..96db4b4fe4e0 100644
--- a/dpnp/dpnp_algo/dpnp_elementwise_common.py
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -119,7 +119,7 @@ class DPNPUnaryFunc(UnaryElementwiseFunc):
             sycl_dev - The :class:`dpctl.SyclDevice` where the function
                 evaluation is carried out.
         The function is invoked when the argument of the unary function
-        requires casting, e.g. the argument of `dpctl.tensor.log` is an
+        requires casting, e.g. the argument of `dpnp.tensor.log` is an
         array with integral data type.
 
     """
@@ -137,7 +137,7 @@ def __init__(
         def _call_func(src, dst, sycl_queue, depends=None):
             """
             A callback to register in UnaryElementwiseFunc class of
-            dpctl.tensor
+            dpnp.tensor
             """
 
             if depends is None:
@@ -588,7 +588,7 @@ class DPNPBinaryFunc(BinaryElementwiseFunc):
                 evaluation is carried out.
         The function is only called when both arguments of the binary
         function require casting, e.g. both arguments of
-        `dpctl.tensor.logaddexp` are arrays with integral data type.
+        `dpnp.tensor.logaddexp` are arrays with integral data type.
     weak_type_resolver : {None, callable}, optional
         Function to influence type promotion behavior for Python scalar types
         of this binary function. The function takes 3 arguments:
@@ -615,7 +615,7 @@ def __init__(
         def _call_func(src1, src2, dst, sycl_queue, depends=None):
             """
             A callback to register in UnaryElementwiseFunc class of
-            dpctl.tensor
+            dpnp.tensor
             """
 
             if depends is None:
diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py
index 6ab67217e482..00a1b2d00e5d 100644
--- a/dpnp/dpnp_array.py
+++ b/dpnp/dpnp_array.py
@@ -72,7 +72,7 @@ class dpnp_array:
     An array object represents a multidimensional tensor of numeric elements
     stored in a USM allocation on a SYCL device.
 
-    This is a wrapper around :class:`dpctl.tensor.usm_ndarray` that provides
+    This is a wrapper around :class:`dpnp.tensor.usm_ndarray` that provides
     methods to be compliant with original NumPy.
 
     """
@@ -609,12 +609,12 @@ def __usm_ndarray__(self):
         """
         Property to support ``__usm_ndarray__`` protocol.
 
-        It assumes to return :class:`dpctl.tensor.usm_ndarray` instance
+        It assumes to return :class:`dpnp.tensor.usm_ndarray` instance
         corresponding to the content of the object.
 
         This property is intended to speed-up conversion from
-        :class:`dpnp.ndarray` to :class:`dpctl.tensor.usm_ndarray` passed into
-        :func:`dpctl.tensor.asarray` function. The input object that implements
+        :class:`dpnp.ndarray` to :class:`dpnp.tensor.usm_ndarray` passed into
+        :func:`dpnp.tensor.asarray` function. The input object that implements
         ``__usm_ndarray__`` protocol is recognized as owner of USM allocation
         that is managed by a smart pointer, and asynchronous deallocation
         will not involve GIL.
@@ -631,13 +631,13 @@ def __xor__(self, other, /):
     def _create_from_usm_ndarray(usm_ary: dpt.usm_ndarray):
         """
         Return :class:`dpnp.ndarray` instance from USM allocation providing
-        by an instance of :class:`dpctl.tensor.usm_ndarray`.
+        by an instance of :class:`dpnp.tensor.usm_ndarray`.
 
         """
 
         if not isinstance(usm_ary, dpt.usm_ndarray):
             raise TypeError(
-                f"Expected dpctl.tensor.usm_ndarray, got {type(usm_ary)}"
+                f"Expected dpnp.tensor.usm_ndarray, got {type(usm_ary)}"
             )
         res = dpnp_array.__new__(dpnp_array)
         res._array_obj = usm_ary
@@ -956,7 +956,7 @@ def astype(
             `device` can be ``None``, a oneAPI filter selector string,
             an instance of :class:`dpctl.SyclDevice` corresponding to
             a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            :class:`dpctl.SyclQueue`, or a :class:`dpnp.tensor.Device` object
             returned by :attr:`dpnp.ndarray.device`.
             If the value is ``None``, returned array is created on the same
             device as that array.
@@ -1067,7 +1067,7 @@ def copy(
             `device` can be ``None``, a oneAPI filter selector string,
             an instance of :class:`dpctl.SyclDevice` corresponding to
             a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            :class:`dpctl.SyclQueue`, or a :class:`dpnp.tensor.Device` object
             returned by :attr:`dpnp.ndarray.device`.
 
             Default: ``None``.
@@ -1162,7 +1162,7 @@ def data(self):
     @property
     def device(self):
         """
-        Return :class:`dpctl.tensor.Device` object representing residence of
+        Return :class:`dpnp.tensor.Device` object representing residence of
         the array data.
 
         The ``Device`` object represents Array API notion of the device, and
@@ -1329,7 +1329,7 @@ def flatten(self, /, order="C"):
         return self.reshape(-1, order=order, copy=True)
 
     def get_array(self):
-        """Get :class:`dpctl.tensor.usm_ndarray` object."""
+        """Get :class:`dpnp.tensor.usm_ndarray` object."""
         return self._array_obj
 
     # 'getfield',
@@ -2182,7 +2182,7 @@ def to_device(self, device, /, *, stream=None):
             `device` can be ``None``, a oneAPI filter selector string,
             an instance of :class:`dpctl.SyclDevice` corresponding to
             a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            :class:`dpctl.SyclQueue`, or a :class:`dpnp.tensor.Device` object
             returned by :attr:`dpnp.ndarray.device`.
         stream : {SyclQueue, None}, optional
             Execution queue to synchronize with. If ``None``, synchronization
diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py
index 374cc2c26f09..4b38c2915178 100644
--- a/dpnp/dpnp_container.py
+++ b/dpnp/dpnp_container.py
@@ -64,7 +64,7 @@ def arange(
     usm_type="device",
     sycl_queue=None,
 ):
-    """Validate input parameters before passing them into `dpctl.tensor` module"""
+    """Validate input parameters before passing them into `dpnp.tensor` module"""
     dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue_normalized = dpnp.get_normalized_queue_device(
         sycl_queue=sycl_queue, device=device
@@ -151,7 +151,7 @@ def empty(
     usm_type="device",
     sycl_queue=None,
 ):
-    """Validate input parameters before passing them into `dpctl.tensor` module"""
+    """Validate input parameters before passing them into `dpnp.tensor` module"""
     dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue_normalized = dpnp.get_normalized_queue_device(
         sycl_queue=sycl_queue, device=device
@@ -182,7 +182,7 @@ def eye(
     usm_type="device",
     sycl_queue=None,
 ):
-    """Validate input parameters before passing them into `dpctl.tensor` module"""
+    """Validate input parameters before passing them into `dpnp.tensor` module"""
     dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue_normalized = dpnp.get_normalized_queue_device(
         sycl_queue=sycl_queue, device=device
@@ -213,7 +213,7 @@ def full(
     usm_type=None,
     sycl_queue=None,
 ):
-    """Validate input parameters before passing them into `dpctl.tensor` module"""
+    """Validate input parameters before passing them into `dpnp.tensor` module"""
     dpt.validate_usm_type(usm_type, allow_none=True)
 
     sycl_queue_normalized = dpnp.get_normalized_queue_device(
@@ -246,7 +246,7 @@ def ones(
     usm_type="device",
     sycl_queue=None,
 ):
-    """Validate input parameters before passing them into `dpctl.tensor` module"""
+    """Validate input parameters before passing them into `dpnp.tensor` module"""
     dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue_normalized = dpnp.get_normalized_queue_device(
         sycl_queue=sycl_queue, device=device
@@ -286,7 +286,7 @@ def zeros(
     usm_type="device",
     sycl_queue=None,
 ):
-    """Validate input parameters before passing them into `dpctl.tensor` module"""
+    """Validate input parameters before passing them into `dpnp.tensor` module"""
     dpt.validate_usm_type(usm_type, allow_none=False)
     sycl_queue_normalized = dpnp.get_normalized_queue_device(
         sycl_queue=sycl_queue, device=device
diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index a9bf24bc56ea..c9d16a20e83d 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -142,7 +142,7 @@ def asnumpy(a, order="C"):
 
 def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None):
     """
-    Return :class:`dpctl.tensor.usm_ndarray` from input object `a`.
+    Return :class:`dpnp.tensor.usm_ndarray` from input object `a`.
 
     Parameters
     ----------
@@ -159,7 +159,7 @@ def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
         If the value is ``None``, returned array is created on the same device
         as `a`.
@@ -180,7 +180,7 @@ def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None):
     out : usm_ndarray
         A dpctl USM ndarray from input array or scalar `a`.
         If `a` is instance of :class:`dpnp.ndarray`
-        or :class:`dpctl.tensor.usm_ndarray`, no array allocation will be done
+        or :class:`dpnp.tensor.usm_ndarray`, no array allocation will be done
         and `dtype`, `device`, `usm_type`, `sycl_queue` keywords
         will be ignored.
 
@@ -256,7 +256,7 @@ def check_limitations(
 def check_supported_arrays_type(*arrays, scalar_type=False, all_scalars=False):
     """
     Return ``True`` if each array has either type of scalar,
-    :class:`dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray`.
+    :class:`dpnp.ndarray` or :class:`dpnp.tensor.usm_ndarray`.
     But if any array has unsupported type, ``TypeError`` will be raised.
 
     Parameters
@@ -318,7 +318,7 @@ def default_float_type(device=None, sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
         The value ``None`` is interpreted as to use a default device.
 
@@ -434,7 +434,7 @@ def get_include():
 def get_normalized_queue_device(obj=None, device=None, sycl_queue=None):
     """
     Utility to process complementary keyword arguments 'device' and 'sycl_queue'
-    in subsequent calls of functions from `dpctl.tensor` module.
+    in subsequent calls of functions from `dpnp.tensor` module.
 
     If both arguments 'device' and 'sycl_queue' have default value ``None``
     and 'obj' has `sycl_queue` attribute, it assumes that Compute Follows Data
@@ -445,7 +445,7 @@ def get_normalized_queue_device(obj=None, device=None, sycl_queue=None):
     ----------
     obj : object, optional
         A python object. Can be an instance of `dpnp_array`,
-        `dpctl.tensor.usm_ndarray`, an object representing SYCL USM allocation
+        `dpnp.tensor.usm_ndarray`, an object representing SYCL USM allocation
         and implementing `__sycl_usm_array_interface__` protocol, an instance
         of `numpy.ndarray`, an object supporting Python buffer protocol,
         a Python scalar, or a (possibly nested) sequence of Python scalars.
@@ -462,7 +462,7 @@ def get_normalized_queue_device(obj=None, device=None, sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
         The value ``None`` is interpreted as to use the same device as `obj`.
 
@@ -472,7 +472,7 @@ def get_normalized_queue_device(obj=None, device=None, sycl_queue=None):
     -------
     sycl_queue: dpctl.SyclQueue
         A :class:`dpctl.SyclQueue` object normalized by
-        `normalize_queue_device` call of `dpctl.tensor` module invoked with
+        `normalize_queue_device` call of `dpnp.tensor` module invoked with
         `device` and `sycl_queue` values. If both incoming `device` and
         `sycl_queue` are ``None`` and `obj` has `sycl_queue` attribute,
         the normalization will be performed for `obj.sycl_queue` value.
@@ -540,13 +540,13 @@ def get_result_array(a, out=None, casting="safe"):
 
 def get_usm_ndarray(a):
     """
-    Return :class:`dpctl.tensor.usm_ndarray` from input array `a`.
+    Return :class:`dpnp.tensor.usm_ndarray` from input array `a`.
 
     Parameters
     ----------
     a : {dpnp.ndarray, usm_ndarray}
         Input array of supported type :class:`dpnp.ndarray`
-        or :class:`dpctl.tensor.usm_ndarray`.
+        or :class:`dpnp.tensor.usm_ndarray`.
 
     Returns
     -------
@@ -571,13 +571,13 @@ def get_usm_ndarray(a):
 
 def get_usm_ndarray_or_scalar(a):
     """
-    Return scalar or :class:`dpctl.tensor.usm_ndarray` from input object `a`.
+    Return scalar or :class:`dpnp.tensor.usm_ndarray` from input object `a`.
 
     Parameters
     ----------
     a : {scalar, dpnp_array, usm_ndarray}
         Input of any supported type: scalar, :class:`dpnp.ndarray`
-        or :class:`dpctl.tensor.usm_ndarray`.
+        or :class:`dpnp.tensor.usm_ndarray`.
 
     Returns
     -------
@@ -634,7 +634,7 @@ def is_cuda_backend(obj=None):
 def is_supported_array_or_scalar(a):
     """
     Return ``True`` if `a` is a scalar or an array of either
-    :class:`dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray` type,
+    :class:`dpnp.ndarray` or :class:`dpnp.tensor.usm_ndarray` type,
     ``False`` otherwise.
 
     Parameters
@@ -656,7 +656,7 @@ def is_supported_array_or_scalar(a):
 def is_supported_array_type(a):
     """
     Return ``True`` if an array of either type :class:`dpnp.ndarray`
-    or :class:`dpctl.tensor.usm_ndarray` type, ``False`` otherwise.
+    or :class:`dpnp.tensor.usm_ndarray` type, ``False`` otherwise.
 
     Parameters
     ----------
diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py
index 4f5f047be1dd..da6b45517eb3 100644
--- a/dpnp/dpnp_iface_arraycreation.py
+++ b/dpnp/dpnp_iface_arraycreation.py
@@ -175,7 +175,7 @@ def arange(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -295,7 +295,7 @@ def array(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -440,7 +440,7 @@ def asanyarray(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -545,7 +545,7 @@ def asarray(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -646,7 +646,7 @@ def ascontiguousarray(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -760,7 +760,7 @@ def asfortranarray(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -897,7 +897,7 @@ def astype(x, dtype, /, *, order="K", casting="unsafe", copy=True, device=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
         If the value is ``None``, returned array is created on the same device
         as `x`.
@@ -966,7 +966,7 @@ def copy(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1086,7 +1086,7 @@ def diag(v, /, k=0, *, device=None, usm_type=None, sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1191,7 +1191,7 @@ def diagflat(v, /, k=0, *, device=None, usm_type=None, sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1297,7 +1297,7 @@ def empty(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1403,7 +1403,7 @@ def empty_like(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1515,7 +1515,7 @@ def eye(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1627,7 +1627,7 @@ def frombuffer(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1747,7 +1747,7 @@ def fromfile(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1868,7 +1868,7 @@ def fromfunction(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1979,7 +1979,7 @@ def fromiter(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -2081,7 +2081,7 @@ def fromstring(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -2158,9 +2158,9 @@ def from_dlpack(x, /, *, device=None, copy=None):
           to a non-partitioned SYCL device.
         * :class:`dpctl.SyclQueue` : Implies SYCL device targeted by the SYCL
           queue.
-        * :class:`dpctl.tensor.Device` : Implies SYCL device
+        * :class:`dpnp.tensor.Device` : Implies SYCL device
           ``device.sycl_queue``. The `device` object is obtained via
-          :attr:`dpctl.tensor.usm_ndarray.device`.
+          :attr:`dpnp.tensor.usm_ndarray.device`.
         * ``(device_type, device_id)`` : 2-tuple matching the format of the
           output of the :meth:`dpnp.ndarray.__dlpack_device__`: an integer
           enumerator representing the device type followed by an integer
@@ -2205,7 +2205,7 @@ def from_dlpack(x, /, *, device=None, copy=None):
     If the return type is :class:`dpnp.ndarray`, the associated SYCL queue is
     derived from the `device` keyword. When `device` keyword value has type
     :class:`dpctl.SyclQueue`, the explicit queue instance is used, when `device`
-    keyword value has type :class:`dpctl.tensor.Device`, the
+    keyword value has type :class:`dpnp.tensor.Device`, the
     ``device.sycl_queue`` is used. In all other cases, the cached SYCL queue
     corresponding to the implied SYCL device is used.
 
@@ -2261,7 +2261,7 @@ def full(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -2370,7 +2370,7 @@ def full_like(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -2485,7 +2485,7 @@ def geomspace(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -2597,7 +2597,7 @@ def identity(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -2715,7 +2715,7 @@ def linspace(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -2827,7 +2827,7 @@ def loadtxt(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -2942,7 +2942,7 @@ def logspace(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -3148,7 +3148,7 @@ class MGridClass:
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -3227,7 +3227,7 @@ class OGridClass:
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -3317,7 +3317,7 @@ def ones(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -3429,7 +3429,7 @@ def ones_like(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -3602,7 +3602,7 @@ def tri(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -3840,7 +3840,7 @@ def vander(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -3970,7 +3970,7 @@ def zeros(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -4082,7 +4082,7 @@ def zeros_like(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index 2a57ebab9c07..1c9776582b73 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -101,7 +101,7 @@ def _build_choices_list(choices):
     list of arrays. If a single array of dimension greater than one, the array
     will be unstacked.
 
-    Returns a list of :class:`dpctl.tensor.usm_ndarray`s.
+    Returns a list of :class:`dpnp.tensor.usm_ndarray`s.
     """
 
     if dpnp.is_supported_array_type(choices):
@@ -447,7 +447,7 @@ def diag_indices(n, ndim=2, device=None, usm_type="device", sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1046,7 +1046,7 @@ def indices(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1310,7 +1310,7 @@ def mask_indices(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -2323,7 +2323,7 @@ def tril_indices(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -2540,7 +2540,7 @@ def triu_indices(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index 11288134b6a1..b96d36a40e6a 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -692,7 +692,7 @@ def asarray_chkfinite(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -791,7 +791,7 @@ def asfarray(a, dtype=None, *, device=None, usm_type=None, sycl_queue=None):
     a : array_like
         Input data, in any form that can be converted to an array.
         This includes an instance of :class:`dpnp.ndarray` or
-        :class:`dpctl.tensor.usm_ndarray`, an object representing
+        :class:`dpnp.tensor.usm_ndarray`, an object representing
         SYCL USM allocation and implementing `__sycl_usm_array_interface__`
         protocol, an instance of :class:`numpy.ndarray`, an object supporting
         Python buffer protocol, a Python scalar, or a (possibly nested)
@@ -808,7 +808,7 @@ def asfarray(a, dtype=None, *, device=None, usm_type=None, sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
diff --git a/dpnp/dpnp_iface_nanfunctions.py b/dpnp/dpnp_iface_nanfunctions.py
index a5fb750cf586..10fffb342305 100644
--- a/dpnp/dpnp_iface_nanfunctions.py
+++ b/dpnp/dpnp_iface_nanfunctions.py
@@ -167,7 +167,7 @@ def nanargmax(a, axis=None, out=None, *, keepdims=False):
     Limitations
     -----------
     Input array is only supported as either :class:`dpnp.ndarray`
-    or :class:`dpctl.tensor.usm_ndarray`.
+    or :class:`dpnp.tensor.usm_ndarray`.
     Input array data types are limited by supported DPNP :ref:`Data types`.
 
     See Also
@@ -251,7 +251,7 @@ def nanargmin(a, axis=None, out=None, *, keepdims=False):
     Limitations
     -----------
     Input and output arrays are only supported as either :class:`dpnp.ndarray`
-    or :class:`dpctl.tensor.usm_ndarray`.
+    or :class:`dpnp.tensor.usm_ndarray`.
     Input array data types are limited by supported DPNP :ref:`Data types`.
 
     See Also
@@ -466,7 +466,7 @@ def nanmax(a, axis=None, out=None, keepdims=False, initial=None, where=True):
     Limitations
     -----------
     Input array is only supported as either :class:`dpnp.ndarray`
-    or :class:`dpctl.tensor.usm_ndarray`.
+    or :class:`dpnp.tensor.usm_ndarray`.
     Parameters `where`, and `initial` are only supported with their default
     values.
     Otherwise ``NotImplementedError`` exception will be raised.
@@ -782,7 +782,7 @@ def nanmin(a, axis=None, out=None, keepdims=False, initial=None, where=True):
     Limitations
     -----------
     Input array is only supported as either :class:`dpnp.ndarray`
-    or :class:`dpctl.tensor.usm_ndarray`.
+    or :class:`dpnp.tensor.usm_ndarray`.
     Parameters `where`, and `initial` are only supported with their default
     values.
     Otherwise ``NotImplementedError`` exception will be raised.
@@ -896,7 +896,7 @@ def nanprod(
     Limitations
     -----------
     Input array is only supported as either :class:`dpnp.ndarray` or
-    :class:`dpctl.tensor.usm_ndarray`.
+    :class:`dpnp.tensor.usm_ndarray`.
     Parameters `initial`, and `where` are only supported with their default
     values.
     Otherwise the function will be executed sequentially on CPU.
diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py
index c24b1a4bc886..8f6f3e80f0d1 100644
--- a/dpnp/dpnp_iface_sorting.py
+++ b/dpnp/dpnp_iface_sorting.py
@@ -64,7 +64,7 @@ def _wrap_sort_argsort(
     descending=False,
     stable=True,
 ):
-    """Wrap a sorting call from dpctl.tensor interface."""
+    """Wrap a sorting call from dpnp.tensor interface."""
 
     if order is not None:
         raise NotImplementedError(
diff --git a/dpnp/dpnp_iface_window.py b/dpnp/dpnp_iface_window.py
index f8d6df07443d..bc12e714663c 100644
--- a/dpnp/dpnp_iface_window.py
+++ b/dpnp/dpnp_iface_window.py
@@ -111,7 +111,7 @@ def bartlett(M, *, device=None, usm_type=None, sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -205,7 +205,7 @@ def blackman(M, *, device=None, usm_type=None, sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -296,7 +296,7 @@ def hamming(M, *, device=None, usm_type=None, sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -380,7 +380,7 @@ def hanning(M, *, device=None, usm_type=None, sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -466,7 +466,7 @@ def kaiser(M, beta, *, device=None, usm_type=None, sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
diff --git a/dpnp/dpnp_utils/dpnp_algo_utils.pyx b/dpnp/dpnp_utils/dpnp_algo_utils.pyx
index 11737831f014..00f40a0358e8 100644
--- a/dpnp/dpnp_utils/dpnp_algo_utils.pyx
+++ b/dpnp/dpnp_utils/dpnp_algo_utils.pyx
@@ -538,7 +538,7 @@ cdef class dpnp_descriptor:
             return self.origin_pyobj.get_array()
 
         raise TypeError(
-            "expected either dpctl.tensor.usm_ndarray or dpnp.dpnp_array.dpnp_array, got {}"
+            "expected either dpnp.tensor.usm_ndarray or dpnp.dpnp_array.dpnp_array, got {}"
             "".format(type(self.origin_pyobj)))
 
     cdef void * get_data(self):
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index fe525609175e..2331eb7a10cc 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -770,7 +770,7 @@ def dpnp_dot(a, b, /, out=None, *, casting="same_kind", conjugate=False):
 
     The routine that is used to perform the main calculation
     depends on input arrays data type: 1) For integer and boolean data types,
-    `dpctl.tensor.vecdot` form the Data Parallel Control library is used,
+    `dpnp.tensor.vecdot` form the Data Parallel Control library is used,
     2) For real-valued floating point data types, `dot` routines from
     BLAS library of OneMKL are used, and 3) For complex data types,
     `dotu` or `dotc` routines from BLAS library of OneMKL are used.
@@ -818,7 +818,7 @@ def dpnp_dot(a, b, /, out=None, *, casting="same_kind", conjugate=False):
         _manager.add_event_pair(ht_ev, dot_ev)
     else:
         # oneapi::mkl::blas::dot does not support integer dtypes,
-        # so using dpctl.tensor.vecdot instead
+        # so using dpnp.tensor.vecdot instead
         a_usm = dpnp.get_usm_ndarray(a)
         b_usm = dpnp.get_usm_ndarray(b)
         result = dpnp_array._create_from_usm_ndarray(dpt.vecdot(a_usm, b_usm))
@@ -1117,7 +1117,7 @@ def dpnp_multiplication(
             else:
                 # oneapi::mkl::blas::gemm/gemv do not support integer dtypes,
                 # except for special cases determined in `_gemm_special_case`,
-                # use dpctl.tensor.matmul for unsupported cases
+                # use dpnp.tensor.matmul for unsupported cases
 
                 # `dpt.matmul` does not support `casting` kwarg.
                 # We may need to change input dtypes based on given `casting`.
diff --git a/dpnp/dpnp_utils/dpnp_utils_reduction.py b/dpnp/dpnp_utils/dpnp_utils_reduction.py
index 8c13c6380870..ba9830bd7eff 100644
--- a/dpnp/dpnp_utils/dpnp_utils_reduction.py
+++ b/dpnp/dpnp_utils/dpnp_utils_reduction.py
@@ -33,7 +33,7 @@
 
 
 def dpnp_wrap_reduction_call(usm_a, out, _reduction_fn, res_dt, **kwargs):
-    """Wrap a reduction call from dpctl.tensor interface."""
+    """Wrap a reduction call from dpnp.tensor interface."""
 
     input_out = out
     if out is None:
diff --git a/dpnp/fft/dpnp_iface_fft.py b/dpnp/fft/dpnp_iface_fft.py
index fcc222640c9a..90e1a112bdaf 100644
--- a/dpnp/fft/dpnp_iface_fft.py
+++ b/dpnp/fft/dpnp_iface_fft.py
@@ -263,7 +263,7 @@ def fftfreq(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1581,7 +1581,7 @@ def rfftfreq(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 9d7b32d48177..cf6d1ff231f2 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -1264,7 +1264,7 @@ def _real_type(dtype, device=None):
         type is created. `device` can be ``None``, a oneAPI filter selector
         string, an instance of :class:`dpctl.SyclDevice` corresponding to
         a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`,
-        or a :class:`dpctl.tensor.Device` object returned by
+        or a :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
diff --git a/dpnp/memory/_memory.py b/dpnp/memory/_memory.py
index ea2a9284596e..ee0188d33b39 100644
--- a/dpnp/memory/_memory.py
+++ b/dpnp/memory/_memory.py
@@ -77,7 +77,7 @@ def create_data(x):
     Parameters
     ----------
     x : usm_ndarray
-        Input array of :class:`dpctl.tensor.usm_ndarray` type.
+        Input array of :class:`dpnp.tensor.usm_ndarray` type.
 
     Returns
     -------
diff --git a/dpnp/random/dpnp_iface_random.py b/dpnp/random/dpnp_iface_random.py
index 31a82fa5ac7b..3cafe12b1958 100644
--- a/dpnp/random/dpnp_iface_random.py
+++ b/dpnp/random/dpnp_iface_random.py
@@ -839,7 +839,7 @@ def normal(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1100,7 +1100,7 @@ def rand(*args, device=None, usm_type="device", sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1161,7 +1161,7 @@ def randint(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1222,7 +1222,7 @@ def randn(d0, *dn, device=None, usm_type="device", sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1277,7 +1277,7 @@ def random(size=None, device=None, usm_type="device", sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1328,7 +1328,7 @@ def random_integers(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1396,7 +1396,7 @@ def random_sample(size=None, device=None, usm_type="device", sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1446,7 +1446,7 @@ def ranf(size=None, device=None, usm_type="device", sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1537,7 +1537,7 @@ def sample(size=None, device=None, usm_type="device", sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1616,7 +1616,7 @@ def seed(seed=None, device=None, sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1777,7 +1777,7 @@ def standard_normal(size=None, device=None, usm_type="device", sycl_queue=None):
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
@@ -1922,7 +1922,7 @@ def uniform(
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
diff --git a/dpnp/random/dpnp_random_state.py b/dpnp/random/dpnp_random_state.py
index e95434bcd410..9456169ec114 100644
--- a/dpnp/random/dpnp_random_state.py
+++ b/dpnp/random/dpnp_random_state.py
@@ -65,7 +65,7 @@ class RandomState:
         `device` can be ``None``, a oneAPI filter selector string, an instance
         of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL
         device, an instance of :class:`dpctl.SyclQueue`, or a
-        :class:`dpctl.tensor.Device` object returned by
+        :class:`dpnp.tensor.Device` object returned by
         :attr:`dpnp.ndarray.device`.
 
         Default: ``None``.
diff --git a/dpnp/tensor/_accumulation.py b/dpnp/tensor/_accumulation.py
index fa1326c3b18d..069eb870f783 100644
--- a/dpnp/tensor/_accumulation.py
+++ b/dpnp/tensor/_accumulation.py
@@ -52,7 +52,7 @@ def _accumulate_common(
     _default_accumulation_type_fn,
 ):
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
     appended_axis = False
     if x.ndim == 0:
         x = x[dpt.newaxis]
diff --git a/dpnp/tensor/_clip.py b/dpnp/tensor/_clip.py
index 4ba2dcecb370..44434fc0bb0c 100644
--- a/dpnp/tensor/_clip.py
+++ b/dpnp/tensor/_clip.py
@@ -303,7 +303,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
     """
     if not isinstance(x, dpt.usm_ndarray):
         raise TypeError(
-            "Expected `x` to be of dpctl.tensor.usm_ndarray type, got "
+            "Expected `x` to be of dpnp.tensor.usm_ndarray type, got "
             f"{type(x)}"
         )
     if order not in ["K", "C", "F", "A"]:
diff --git a/dpnp/tensor/_copy_utils.py b/dpnp/tensor/_copy_utils.py
index ac1e9a9863a2..3978e7345b12 100644
--- a/dpnp/tensor/_copy_utils.py
+++ b/dpnp/tensor/_copy_utils.py
@@ -53,7 +53,7 @@
 
 def _copy_to_numpy(ary):
     if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(ary)}")
     if ary.size == 0:
         # no data needs to be copied for zero sized array
         return np.ndarray(ary.shape, dtype=ary.dtype)
@@ -139,7 +139,7 @@ def _extract_impl(ary, ary_mask, axis=0):
     """
     if not isinstance(ary, dpt.usm_ndarray):
         raise TypeError(
-            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+            f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}"
         )
     if isinstance(ary_mask, dpt.usm_ndarray):
         dst_usm_type = dpt.get_coerced_usm_type(
@@ -159,7 +159,7 @@ def _extract_impl(ary, ary_mask, axis=0):
         )
     else:
         raise TypeError(
-            "Expecting type dpctl.tensor.usm_ndarray or numpy.ndarray, got "
+            "Expecting type dpnp.tensor.usm_ndarray or numpy.ndarray, got "
             f"{type(ary_mask)}"
         )
     ary_nd = ary.ndim
@@ -236,7 +236,7 @@ def _get_indices_queue_usm_type(inds, queue, usm_type):
 def _nonzero_impl(ary):
     if not isinstance(ary, dpt.usm_ndarray):
         raise TypeError(
-            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+            f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}"
         )
     exec_q = ary.sycl_queue
     usm_type = ary.usm_type
@@ -313,7 +313,7 @@ def _place_impl(ary, ary_mask, vals, axis=0):
     """
     if not isinstance(ary, dpt.usm_ndarray):
         raise TypeError(
-            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+            f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}"
         )
     if isinstance(ary_mask, dpt.usm_ndarray):
         exec_q = dpt.get_execution_queue(
@@ -341,7 +341,7 @@ def _place_impl(ary, ary_mask, vals, axis=0):
         )
     else:
         raise TypeError(
-            "Expecting type dpctl.tensor.usm_ndarray or numpy.ndarray, got "
+            "Expecting type dpnp.tensor.usm_ndarray or numpy.ndarray, got "
             f"{type(ary_mask)}"
         )
     if exec_q is not None:
@@ -413,7 +413,7 @@ def _place_impl(ary, ary_mask, vals, axis=0):
 def _put_multi_index(ary, inds, p, vals, mode=0):
     if not isinstance(ary, dpt.usm_ndarray):
         raise TypeError(
-            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+            f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}"
         )
     ary_nd = ary.ndim
     p = normalize_axis_index(operator.index(p), ary_nd)
@@ -486,7 +486,7 @@ def _put_multi_index(ary, inds, p, vals, mode=0):
 def _take_multi_index(ary, inds, p, mode=0):
     if not isinstance(ary, dpt.usm_ndarray):
         raise TypeError(
-            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+            f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}"
         )
     ary_nd = ary.ndim
     p = normalize_axis_index(operator.index(p), ary_nd)
@@ -703,7 +703,7 @@ def _copy_from_usm_ndarray_to_usm_ndarray(dst, src):
         )
     ):
         raise TypeError(
-            "Both types are expected to be dpctl.tensor.usm_ndarray, "
+            "Both types are expected to be dpnp.tensor.usm_ndarray, "
             f"got {type(dst)} and {type(src)}."
         )
 
diff --git a/dpnp/tensor/_ctors.py b/dpnp/tensor/_ctors.py
index 7e9a6202f12a..b6e28afdc9e7 100644
--- a/dpnp/tensor/_ctors.py
+++ b/dpnp/tensor/_ctors.py
@@ -244,7 +244,7 @@ def _asarray_from_usm_ndarray(
 ):
     if not isinstance(usm_ndary, dpt.usm_ndarray):
         raise TypeError(
-            f"Expected dpctl.tensor.usm_ndarray, got {type(usm_ndary)}"
+            f"Expected dpnp.tensor.usm_ndarray, got {type(usm_ndary)}"
         )
     if usm_type is None:
         usm_type = usm_ndary.usm_type
@@ -1687,7 +1687,7 @@ def tril(x, /, *, k=0):
     """
     if not isinstance(x, dpt.usm_ndarray):
         raise TypeError(
-            "Expected argument of type dpctl.tensor.usm_ndarray, "
+            "Expected argument of type dpnp.tensor.usm_ndarray, "
             f"got {type(x)}."
         )
 
@@ -1765,7 +1765,7 @@ def triu(x, /, *, k=0):
     """
     if not isinstance(x, dpt.usm_ndarray):
         raise TypeError(
-            "Expected argument of type dpctl.tensor.usm_ndarray, "
+            "Expected argument of type dpnp.tensor.usm_ndarray, "
             f"got {type(x)}."
         )
 
diff --git a/dpnp/tensor/_elementwise_common.py b/dpnp/tensor/_elementwise_common.py
index e258df1b2e93..2eb89b8fb5f8 100644
--- a/dpnp/tensor/_elementwise_common.py
+++ b/dpnp/tensor/_elementwise_common.py
@@ -181,7 +181,7 @@ def types(self):
 
     def __call__(self, x, /, *, out=None, order="K"):
         if not isinstance(x, dpt.usm_ndarray):
-            raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+            raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
 
         if order not in ["C", "F", "K", "A"]:
             order = "K"
@@ -859,7 +859,7 @@ def _inplace_op(self, o1, o2):
         if not isinstance(o1, dpt.usm_ndarray):
             raise TypeError(
                 "Expected first argument to be "
-                f"dpctl.tensor.usm_ndarray, got {type(o1)}"
+                f"dpnp.tensor.usm_ndarray, got {type(o1)}"
             )
         if not o1.flags.writable:
             raise ValueError("provided left-hand side array is read-only")
diff --git a/dpnp/tensor/_indexing_functions.py b/dpnp/tensor/_indexing_functions.py
index 32162942d738..9ea0a16bdd03 100644
--- a/dpnp/tensor/_indexing_functions.py
+++ b/dpnp/tensor/_indexing_functions.py
@@ -84,11 +84,11 @@ def extract(condition, arr):
     """
     if not isinstance(condition, dpt.usm_ndarray):
         raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}"
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(condition)}"
         )
     if not isinstance(arr, dpt.usm_ndarray):
         raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(arr)}"
         )
     exec_q = dpt.get_execution_queue(
         (
@@ -123,7 +123,7 @@ def nonzero(arr):
     """
     if not isinstance(arr, dpt.usm_ndarray):
         raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(arr)}"
         )
     if arr.ndim == 0:
         raise ValueError("Array of positive rank is expected")
@@ -152,15 +152,15 @@ def place(arr, mask, vals):
     """
     if not isinstance(arr, dpt.usm_ndarray):
         raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(arr)}"
         )
     if not isinstance(mask, dpt.usm_ndarray):
         raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(mask)}"
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(mask)}"
         )
     if not isinstance(vals, dpt.usm_ndarray):
         raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(vals)}"
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(vals)}"
         )
     exec_q = dpt.get_execution_queue(
         (
@@ -385,10 +385,10 @@ def put_along_axis(x, indices, vals, /, *, axis=-1, mode="wrap"):
         work.
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
     if not isinstance(indices, dpt.usm_ndarray):
         raise TypeError(
-            f"Expected dpctl.tensor.usm_ndarray, got {type(indices)}"
+            f"Expected dpnp.tensor.usm_ndarray, got {type(indices)}"
         )
     x_nd = x.ndim
     if x_nd != indices.ndim:
@@ -597,10 +597,10 @@ def take_along_axis(x, indices, /, *, axis=-1, mode="wrap"):
         by the value of ``mode`` keyword.
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
     if not isinstance(indices, dpt.usm_ndarray):
         raise TypeError(
-            f"Expected dpctl.tensor.usm_ndarray, got {type(indices)}"
+            f"Expected dpnp.tensor.usm_ndarray, got {type(indices)}"
         )
     x_nd = x.ndim
     if x_nd != indices.ndim:
diff --git a/dpnp/tensor/_linear_algebra_functions.py b/dpnp/tensor/_linear_algebra_functions.py
index ad64fd201eb0..dcaf99b4423c 100644
--- a/dpnp/tensor/_linear_algebra_functions.py
+++ b/dpnp/tensor/_linear_algebra_functions.py
@@ -69,7 +69,7 @@ def matrix_transpose(x):
         )
     if x.ndim < 2:
         raise ValueError(
-            "dpctl.tensor.matrix_transpose requires array to have"
+            "dpnp.tensor.matrix_transpose requires array to have"
             "at least 2 dimensions"
         )
 
@@ -115,9 +115,9 @@ def tensordot(x1, x2, axes=2):
             must have a data type determined by Type Promotion Rules.
     """
     if not isinstance(x1, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x1)}")
     if not isinstance(x2, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x2)}")
     q1, x1_usm_type = x1.sycl_queue, x1.usm_type
     q2, x2_usm_type = x2.sycl_queue, x2.usm_type
     exec_q = dpt.get_execution_queue((q1, q2))
@@ -351,9 +351,9 @@ def vecdot(x1, x2, axis=-1):
             to non-contracted axes.
     """
     if not isinstance(x1, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x1)}")
     if not isinstance(x2, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x2)}")
     q1, x1_usm_type = x1.sycl_queue, x1.usm_type
     q2, x2_usm_type = x2.sycl_queue, x2.usm_type
     exec_q = dpt.get_execution_queue((q1, q2))
@@ -653,9 +653,9 @@ def matmul(x1, x2, out=None, dtype=None, order="K"):
             point type, neither argument is complex conjugated or transposed.
     """
     if not isinstance(x1, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x1)}")
     if not isinstance(x2, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x2)}")
     if order not in ["K", "C", "F", "A"]:
         order = "K"
     q1, x1_usm_type = x1.sycl_queue, x1.usm_type
diff --git a/dpnp/tensor/_print.py b/dpnp/tensor/_print.py
index 51de51265907..e39bf9041485 100644
--- a/dpnp/tensor/_print.py
+++ b/dpnp/tensor/_print.py
@@ -397,7 +397,7 @@ def usm_ndarray_str(
         str: string representation of input array.
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
 
     options = get_print_options()
     options.update(
@@ -459,7 +459,7 @@ def usm_ndarray_repr(
         str: formatted string representing the input array
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
 
     if line_width is None:
         line_width = _print_options["linewidth"]
diff --git a/dpnp/tensor/_reduction.py b/dpnp/tensor/_reduction.py
index dfa77c63fe92..782fc2b0b442 100644
--- a/dpnp/tensor/_reduction.py
+++ b/dpnp/tensor/_reduction.py
@@ -42,7 +42,7 @@
 
 def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn):
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
 
     nd = x.ndim
     if axis is None:
@@ -149,7 +149,7 @@ def _reduction_over_axis(
     _default_reduction_type_fn,
 ):
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
     nd = x.ndim
     if axis is None:
         axis = tuple(range(nd))
@@ -298,7 +298,7 @@ def _reduction_over_axis(
 
 def _search_over_axis(x, axis, keepdims, out, _reduction_fn):
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
 
     nd = x.ndim
     if axis is None:
diff --git a/dpnp/tensor/_search_functions.py b/dpnp/tensor/_search_functions.py
index 339f2b2a4e3d..c1d45ee4bb33 100644
--- a/dpnp/tensor/_search_functions.py
+++ b/dpnp/tensor/_search_functions.py
@@ -162,7 +162,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
     """
     if not isinstance(condition, dpt.usm_ndarray):
         raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}"
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(condition)}"
         )
     if order not in ["K", "C", "F", "A"]:
         order = "K"
diff --git a/dpnp/tensor/_searchsorted.py b/dpnp/tensor/_searchsorted.py
index 0702e1711ef9..4c9b54cb63fa 100644
--- a/dpnp/tensor/_searchsorted.py
+++ b/dpnp/tensor/_searchsorted.py
@@ -89,13 +89,11 @@ def searchsorted(
             Default: `None`.
     """
     if not isinstance(x1, usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x1)}")
     if not isinstance(x2, usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x2)}")
     if sorter is not None and not isinstance(sorter, usm_ndarray):
-        raise TypeError(
-            f"Expected dpctl.tensor.usm_ndarray, got {type(sorter)}"
-        )
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(sorter)}")
 
     if side not in ["left", "right"]:
         raise ValueError(
diff --git a/dpnp/tensor/_set_functions.py b/dpnp/tensor/_set_functions.py
index 3b1a9b66d0da..067de75c42ce 100644
--- a/dpnp/tensor/_set_functions.py
+++ b/dpnp/tensor/_set_functions.py
@@ -103,7 +103,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray:
             returned array has the same data type as `x`.
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
     array_api_dev = x.device
     exec_q = array_api_dev.sycl_queue
     if x.ndim == 1:
@@ -196,7 +196,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
               array index data type.
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
     array_api_dev = x.device
     exec_q = array_api_dev.sycl_queue
     x_usm_type = x.usm_type
@@ -329,7 +329,7 @@ def unique_inverse(x):
               index data type.
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
     array_api_dev = x.device
     exec_q = array_api_dev.sycl_queue
     x_usm_type = x.usm_type
@@ -498,7 +498,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
               array index data type.
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
     array_api_dev = x.device
     exec_q = array_api_dev.sycl_queue
     x_usm_type = x.usm_type
diff --git a/dpnp/tensor/_slicing.pxi b/dpnp/tensor/_slicing.pxi
index 86db56013e23..f387aef8afd8 100644
--- a/dpnp/tensor/_slicing.pxi
+++ b/dpnp/tensor/_slicing.pxi
@@ -252,7 +252,7 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int):
             else:
                 raise IndexError(
                     "Only integers, slices (`:`), ellipsis (`...`), "
-                    "dpctl.tensor.newaxis (`None`) and integer and "
+                    "dpnp.tensor.newaxis (`None`) and integer and "
                     "boolean arrays are valid indices."
                 )
         if ellipses_count > 1:
@@ -378,6 +378,6 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int):
     else:
         raise IndexError(
             "Only integers, slices (`:`), ellipsis (`...`), "
-            "dpctl.tensor.newaxis (`None`) and integer and "
+            "dpnp.tensor.newaxis (`None`) and integer and "
             "boolean arrays are valid indices."
         )
diff --git a/dpnp/tensor/_sorting.py b/dpnp/tensor/_sorting.py
index fb4d3e4d98e4..c912b4f77cdf 100644
--- a/dpnp/tensor/_sorting.py
+++ b/dpnp/tensor/_sorting.py
@@ -89,9 +89,7 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
             the same shape as the input array `x`.
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}"
-        )
+        raise TypeError(f"Expected type dpnp.tensor.usm_ndarray, got {type(x)}")
     nd = x.ndim
     if nd == 0:
         axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis")
@@ -205,9 +203,7 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None):
             data type.
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}"
-        )
+        raise TypeError(f"Expected type dpnp.tensor.usm_ndarray, got {type(x)}")
     nd = x.ndim
     if nd == 0:
         axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis")
@@ -336,9 +332,7 @@ def top_k(x, k, /, *, axis=None, mode="largest"):
     """
     largest = _get_top_k_largest(mode)
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}"
-        )
+        raise TypeError(f"Expected type dpnp.tensor.usm_ndarray, got {type(x)}")
 
     k = operator.index(k)
     if k < 0:
diff --git a/dpnp/tensor/_statistical_functions.py b/dpnp/tensor/_statistical_functions.py
index 3d717554b5f8..a2015488aff2 100644
--- a/dpnp/tensor/_statistical_functions.py
+++ b/dpnp/tensor/_statistical_functions.py
@@ -194,7 +194,7 @@ def mean(x, axis=None, keepdims=False):
             where input array `x` is allocated.
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
     nd = x.ndim
     if axis is None:
         axis = tuple(range(nd))
@@ -306,7 +306,7 @@ def var(x, axis=None, correction=0.0, keepdims=False):
             where input array `x` is allocated.
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
 
     if not isinstance(correction, (int, float)):
         raise TypeError(
@@ -358,7 +358,7 @@ def std(x, axis=None, correction=0.0, keepdims=False):
             where input array `x` is allocated.
     """
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
 
     if not isinstance(correction, (int, float)):
         raise TypeError(
diff --git a/dpnp/tensor/_testing.py b/dpnp/tensor/_testing.py
index 33b1b30980a3..fbec13fdeb36 100644
--- a/dpnp/tensor/_testing.py
+++ b/dpnp/tensor/_testing.py
@@ -121,11 +121,11 @@ def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
     """
     if not isinstance(a1, dpt.usm_ndarray):
         raise TypeError(
-            f"Expected dpctl.tensor.usm_ndarray type, got {type(a1)}."
+            f"Expected dpnp.tensor.usm_ndarray type, got {type(a1)}."
         )
     if not isinstance(a2, dpt.usm_ndarray):
         raise TypeError(
-            f"Expected dpctl.tensor.usm_ndarray type, got {type(a2)}."
+            f"Expected dpnp.tensor.usm_ndarray type, got {type(a2)}."
         )
     atol = float(atol)
     rtol = float(rtol)
diff --git a/dpnp/tensor/_usmarray.pyx b/dpnp/tensor/_usmarray.pyx
index ad172091702f..c696056d53c2 100644
--- a/dpnp/tensor/_usmarray.pyx
+++ b/dpnp/tensor/_usmarray.pyx
@@ -903,7 +903,7 @@ cdef class usm_ndarray:
             raise ValueError(
                 "array.T requires array to have 2 dimensions. "
                 "Use array.mT to transpose stacks of matrices and "
-                "dpctl.tensor.permute_dims() to permute dimensions."
+                "dpnp.tensor.permute_dims() to permute dimensions."
             )
 
     @property
@@ -1161,7 +1161,7 @@ cdef class usm_ndarray:
 
         raise ValueError(
             "The truth value of an array with more than one element is "
-            "ambiguous. Use dpctl.tensor.any() or dpctl.tensor.all()"
+            "ambiguous. Use dpnp.tensor.any() or dpnp.tensor.all()"
         )
 
     def __float__(self):
@@ -1491,7 +1491,7 @@ cdef class usm_ndarray:
                     except Exception:
                         raise ValueError(
                             f"Input of type {type(rhs)} could not be "
-                            "copied into dpctl.tensor.usm_ndarray"
+                            "copied into dpnp.tensor.usm_ndarray"
                         )
             return
 
@@ -1620,8 +1620,8 @@ cdef class usm_ndarray:
         """
         raise TypeError(
             "Implicit conversion to a NumPy array is not allowed. "
-            "Use `dpctl.tensor.asnumpy` to copy data from this "
-            "`dpctl.tensor.usm_ndarray` instance to NumPy array"
+            "Use `dpnp.tensor.asnumpy` to copy data from this "
+            "`dpnp.tensor.usm_ndarray` instance to NumPy array"
         )
 
 
diff --git a/dpnp/tensor/_utility_functions.py b/dpnp/tensor/_utility_functions.py
index a02f7406d135..651ce0830266 100644
--- a/dpnp/tensor/_utility_functions.py
+++ b/dpnp/tensor/_utility_functions.py
@@ -50,7 +50,7 @@
 
 def _boolean_reduction(x, axis, keepdims, func):
     if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
 
     nd = x.ndim
     if axis is None:
@@ -468,7 +468,7 @@ def diff(x, /, *, axis=-1, n=1, prepend=None, append=None):
 
     if not isinstance(x, dpt.usm_ndarray):
         raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(x)}"
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(x)}"
         )
     x_nd = x.ndim
     axis = normalize_axis_index(operator.index(axis), x_nd)
diff --git a/dpnp/tests/tensor/test_tensor_clip.py b/dpnp/tests/tensor/test_tensor_clip.py
index 759fc0ef11c7..cfd9f6cfab2e 100644
--- a/dpnp/tests/tensor/test_tensor_clip.py
+++ b/dpnp/tests/tensor/test_tensor_clip.py
@@ -548,7 +548,7 @@ def test_clip_errors():
     ar3 = dpt.ones(2, dtype="f4")
     assert_raises_regex(
         TypeError,
-        "Expected `x` to be of dpctl.tensor.usm_ndarray type*",
+        "Expected `x` to be of dpnp.tensor.usm_ndarray type*",
         dpt.clip,
         ar1,
         ar2,
diff --git a/dpnp/tests/tensor/test_tensor_diff.py b/dpnp/tests/tensor/test_tensor_diff.py
index e5beea6845b1..f75b9d4a3639 100644
--- a/dpnp/tests/tensor/test_tensor_diff.py
+++ b/dpnp/tests/tensor/test_tensor_diff.py
@@ -324,7 +324,7 @@ def test_diff_input_validation():
     bad_in = {}
     assert_raises_regex(
         TypeError,
-        "Expecting dpctl.tensor.usm_ndarray type, got.*",
+        "Expecting dpnp.tensor.usm_ndarray type, got.*",
         dpt.diff,
         bad_in,
     )
diff --git a/dpnp/tests/third_party/cupy/indexing_tests/test_insert.py b/dpnp/tests/third_party/cupy/indexing_tests/test_insert.py
index 7399343e7e57..3b23b32fe3b2 100644
--- a/dpnp/tests/third_party/cupy/indexing_tests/test_insert.py
+++ b/dpnp/tests/third_party/cupy/indexing_tests/test_insert.py
@@ -84,7 +84,7 @@ def test_put(self, xp, dtype):
         # Take care so that actual indices don't overlap.
         if self.mode == "raise":
             pytest.skip("'raise' mode is not supported")
-        # `wrap` mode in dpctl.tensor.put is different from numpy.put (#1365):
+        # `wrap` mode in dpnp.tensor.put is different from numpy.put (#1365):
         # numpy`s `wrap` mode wraps indices around for cyclic operations
         # while dpctl`s `wrap` mode restricts indices to stay within the array bounds (-n <= i < n).
         elif self.mode == "wrap":

From 549e940904205bd92d55855cbe2e72b2faf7f901 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 15 Apr 2026 14:48:26 +0200
Subject: [PATCH 40/43] Add device-aware output dtype for `dpt.round()` with
 boolean input (#2851)

This PR proposes device-aware output dtype resolution for
`dpnp.tensor.round()` with `boolean` input to handle devices that do not
support `float16`

Boolean support for round() was originally added in #2817
[6f5a792](https://github.com/IntelPython/dpnp/pull/2817/changes/6f5a792c8542143b395ba1c4e6e44e6bdaf85576)
to match NumPy behavior where numpy.round(bool) returns float16 rather
than an integral type like int8.
However on devices without fp16 support, returning float16 is not
viable.

The bool type mapping was removed from the round kernel and an
acceptance
function `_acceptance_fn_round` was added to ensure the fallback in
`_find_buf_dtype`
prefers floating-point output over integral types for boolean input

Result :
fp16 devices: round(bool) -> float16
non-fp16 devices: round(bool) -> float32
---
 dpnp/tensor/_elementwise_funcs.py                         | 7 ++++++-
 dpnp/tensor/_type_utils.py                                | 8 ++++++++
 .../include/kernels/elementwise_functions/round.hpp       | 1 -
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/dpnp/tensor/_elementwise_funcs.py b/dpnp/tensor/_elementwise_funcs.py
index 5d38cad0c2a5..4040f33bf38e 100644
--- a/dpnp/tensor/_elementwise_funcs.py
+++ b/dpnp/tensor/_elementwise_funcs.py
@@ -33,6 +33,7 @@
     _acceptance_fn_divide,
     _acceptance_fn_negative,
     _acceptance_fn_reciprocal,
+    _acceptance_fn_round,
     _acceptance_fn_subtract,
     _resolve_weak_types_all_py_ints,
 )
@@ -1723,7 +1724,11 @@
 """
 
 round = UnaryElementwiseFunc(
-    "round", ti._round_result_type, ti._round, _round_docstring
+    "round",
+    ti._round_result_type,
+    ti._round,
+    _round_docstring,
+    acceptance_fn=_acceptance_fn_round,
 )
 del _round_docstring
 
diff --git a/dpnp/tensor/_type_utils.py b/dpnp/tensor/_type_utils.py
index 3da9e7994760..b03ca1e1c79d 100644
--- a/dpnp/tensor/_type_utils.py
+++ b/dpnp/tensor/_type_utils.py
@@ -133,6 +133,13 @@ def _acceptance_fn_reciprocal(arg_dtype, buf_dt, res_dt, sycl_dev):
         return True
 
 
+def _acceptance_fn_round(arg_dtype, buf_dt, res_dt, sycl_dev):
+    # for boolean input, prefer floating-point output over integral
+    if arg_dtype.kind == "b" and res_dt.kind != "f":
+        return False
+    return True
+
+
 def _acceptance_fn_subtract(
     arg1_dtype, arg2_dtype, buf1_dt, buf2_dt, res_dt, sycl_dev
 ):
@@ -970,6 +977,7 @@ def _default_accumulation_dtype_fp_types(inp_dt, q):
     "_find_buf_dtype2",
     "_to_device_supported_dtype",
     "_acceptance_fn_default_unary",
+    "_acceptance_fn_round",
     "_acceptance_fn_reciprocal",
     "_acceptance_fn_default_binary",
     "_acceptance_fn_divide",
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
index 18867a09bcef..b20166a4d505 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
@@ -116,7 +116,6 @@ template <typename T>
 struct RoundOutputType
 {
     using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, bool, sycl::half>,
         td_ns::TypeMapResultEntry<T, std::uint8_t>,
         td_ns::TypeMapResultEntry<T, std::uint16_t>,
         td_ns::TypeMapResultEntry<T, std::uint32_t>,

From 125c9137d1942a823030e2676e7d1ab24c908488 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 15 Apr 2026 15:47:57 +0200
Subject: [PATCH 41/43] Fix test warnings in dpnp.tensor tests (#2852)

This PR proposes to fix test warnings in `dpnp.tensor` tests by
replacing deprecated strides assignment with
`np.lib.stride_tricks.as_strided` in `test_usm_ndarray_dlpack.py`
and suppressing overflow warnings from np.allclose in
`test_exp.py:test_exp_complex_contig`
---
 dpnp/tests/tensor/elementwise/test_exp.py    | 1 +
 dpnp/tests/tensor/test_usm_ndarray_dlpack.py | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/dpnp/tests/tensor/elementwise/test_exp.py b/dpnp/tests/tensor/elementwise/test_exp.py
index d123ed0c83a8..5ff2d05fbd83 100644
--- a/dpnp/tests/tensor/elementwise/test_exp.py
+++ b/dpnp/tests/tensor/elementwise/test_exp.py
@@ -78,6 +78,7 @@ def test_exp_real_contig(dtype):
     assert_allclose(dpt.asnumpy(Z), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
 
 
+@pytest.mark.filterwarnings("ignore:overflow encountered:RuntimeWarning")
 @pytest.mark.parametrize("dtype", ["c8", "c16"])
 def test_exp_complex_contig(dtype):
     q = get_queue_or_skip()
diff --git a/dpnp/tests/tensor/test_usm_ndarray_dlpack.py b/dpnp/tests/tensor/test_usm_ndarray_dlpack.py
index 4b04339fe7f9..7db73467f788 100644
--- a/dpnp/tests/tensor/test_usm_ndarray_dlpack.py
+++ b/dpnp/tests/tensor/test_usm_ndarray_dlpack.py
@@ -783,7 +783,9 @@ def test_copy_via_host_gh_1789():
     get_queue_or_skip()
     x_np = np.ones((10, 10), dtype="i4")
     # strides are no longer multiple of itemsize
-    x_np.strides = (x_np.strides[0] - 1, x_np.strides[1])
+    x_np = np.lib.stride_tricks.as_strided(
+        x_np, shape=x_np.shape, strides=(x_np.strides[0] - 1, x_np.strides[1])
+    )
     with pytest.raises(BufferError):
         dpt.from_dlpack(x_np)
     with pytest.raises(BufferError):

From 391fb6d9b717237699ea7487cedc526bd900736d Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 16 Apr 2026 13:28:59 +0200
Subject: [PATCH 42/43] Update CHANGELOG.md with `dpnp.tensor` migration
 (#2857)

This PR adds `dpnp.tensor` migration entry to `CHANGELOG.md` and update
release summary
---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f8aaae542ec5..bf659a351a57 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [0.20.0] - MM/DD/2026
 
+This release introduces a major architectural change: the Array API-compliant tensor implementation has been migrated from `dpctl.tensor` into `dpnp.tensor`, simplifying maintenance, reducing cross-project dependencies, and allows the tensor implementation to evolve within `dpnp`.
 This release changes the license from `BSD-2-Clause` to `BSD-3-Clause`.
 This release achieves `dpnp` compatibility with Python 3.14 and enables distributing `dpnp` packages with the latest Python version.
 Also, that release drops support for Python 3.9, making Python 3.10 the minimum required version.
@@ -28,6 +29,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Added implementation of `dpnp.isin` function [#2595](https://github.com/IntelPython/dpnp/pull/2595)
 * Added implementation of `dpnp.scipy.linalg.lu` (SciPy-compatible) [#2787](https://github.com/IntelPython/dpnp/pull/2787)
 * Added support for ndarray subclassing via `dpnp.ndarray.view` method with `type` parameter [#2815](https://github.com/IntelPython/dpnp/issues/2815)
+* Migrated tensor implementation from `dpctl.tensor` into `dpnp.tensor`, making `dpnp` the primary owner of the Array API-compliant tensor layer [#2856](https://github.com/IntelPython/dpnp/pull/2856)
 
 ### Changed
 
@@ -84,6 +86,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Resolved an issue with strides calculation in `dpnp.diagonal` to return correct values for empty diagonals [#2814](https://github.com/IntelPython/dpnp/pull/2814)
 * Fixed test tolerance issues for float16 intermediate precision that became visible when testing against conda-forge's NumPy [#2828](https://github.com/IntelPython/dpnp/pull/2828)
 * Ensured device aware dtype handling in `dpnp.identity` and `dpnp.gradient` [#2835](https://github.com/IntelPython/dpnp/pull/2835)
+* Fixed `dpnp.tensor.round` to use device-aware output dtype for boolean input [#2851](https://github.com/IntelPython/dpnp/pull/2851)
 
 ### Security
 

From 80c49bee7db6b025456be3f96a78b85504cdc492 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 16 Apr 2026 17:00:42 +0200
Subject: [PATCH 43/43]  Use `dpctl_capi.h` header in `dpnp4pybind11.hpp`
 (#2859)

This PR proposes replacing the individual dpctl C-API includes and
import calls in `dpnp4pybind11.hpp` with `dpctl_capi.h` header and its
`import_dpctl()` function.
Previous changes to the explicit imports were made to avoid conflicting
with dpctl which contained a tensor;
now that dpctl no longer contains a tensor, we can use the header
directly.

Additionally, fixes `RuntimeWarning` filter in `test_exp_complex_contig`
---
 dpnp/backend/include/dpnp4pybind11.hpp    | 36 ++++-------------------
 dpnp/tests/tensor/elementwise/test_exp.py |  2 +-
 2 files changed, 6 insertions(+), 32 deletions(-)

diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp
index 896ff20873a5..8bc931a3ca1a 100644
--- a/dpnp/backend/include/dpnp4pybind11.hpp
+++ b/dpnp/backend/include/dpnp4pybind11.hpp
@@ -28,29 +28,8 @@
 
 #pragma once
 
-// Include dpctl SYCL interface from external dpctl package
-#include "syclinterface/dpctl_sycl_extension_interface.h"
-#include "syclinterface/dpctl_sycl_types.h"
-
-#ifdef __cplusplus
-#define CYTHON_EXTERN_C extern "C"
-#else
-#define CYTHON_EXTERN_C
-#endif
-
-// Include dpctl C-API headers (both declarations and import functions)
-#include "dpctl/_sycl_context.h"
-#include "dpctl/_sycl_context_api.h"
-#include "dpctl/_sycl_device.h"
-#include "dpctl/_sycl_device_api.h"
-#include "dpctl/_sycl_event.h"
-#include "dpctl/_sycl_event_api.h"
-#include "dpctl/_sycl_queue.h"
-#include "dpctl/_sycl_queue_api.h"
-#include "dpctl/memory/_memory.h"
-#include "dpctl/memory/_memory_api.h"
-#include "dpctl/program/_program.h"
-#include "dpctl/program/_program_api.h"
+// Include dpctl C-API headers
+#include "dpctl_capi.h"
 
 // Include generated Cython headers for usm_ndarray
 // (struct definition and constants only)
@@ -253,14 +232,9 @@ class dpctl_capi
           default_usm_memory_{}, default_usm_ndarray_{}, as_usm_memory_{}
 
     {
-        // Import dpctl SYCL interface modules
-        // This imports python modules and initializes pointers to Python types
-        import_dpctl___sycl_device();
-        import_dpctl___sycl_context();
-        import_dpctl___sycl_event();
-        import_dpctl___sycl_queue();
-        import_dpctl__memory___memory();
-        import_dpctl__program___program();
+        // Import dpctl C-API
+        // (device, context, event, queue, memory, program)
+        import_dpctl();
         // Import dpnp tensor module for PyUSMArrayType
         import_dpnp__tensor___usmarray();
 
diff --git a/dpnp/tests/tensor/elementwise/test_exp.py b/dpnp/tests/tensor/elementwise/test_exp.py
index 5ff2d05fbd83..ca204128317e 100644
--- a/dpnp/tests/tensor/elementwise/test_exp.py
+++ b/dpnp/tests/tensor/elementwise/test_exp.py
@@ -78,7 +78,7 @@ def test_exp_real_contig(dtype):
     assert_allclose(dpt.asnumpy(Z), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
 
 
-@pytest.mark.filterwarnings("ignore:overflow encountered:RuntimeWarning")
+@pytest.mark.filterwarnings("ignore::RuntimeWarning")
 @pytest.mark.parametrize("dtype", ["c8", "c16"])
 def test_exp_complex_contig(dtype):
     q = get_queue_or_skip()