From 3b84e7a32b64dd5a2b9346cb3931fcc7246646da Mon Sep 17 00:00:00 2001 From: yaoxiao Date: Sat, 25 Apr 2026 23:52:31 +0800 Subject: [PATCH 1/5] [improvement](be) Optimize cosine array distance calculation ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Avoid redundant square root calculations in cosine_distance and cosine_similarity while preserving the existing semantics for non-zero vectors. Empty arrays and zero vectors keep their previous return values, and pointer preconditions are asserted instead of silently mapping unexpected internal states to user-visible distance values. ### Release note None ### Check List (For Author) - Test: Manual test - Ran git diff --check for be/src/exprs/function/array/function_array_distance.cpp. Full build and unit/regression tests were not run because the current worktree is not initialized (.worktree_initialized and thirdparty/installed are missing). - Behavior changed: No - Does this need documentation: No --- .../array/function_array_distance.cpp | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/be/src/exprs/function/array/function_array_distance.cpp b/be/src/exprs/function/array/function_array_distance.cpp index 89a0dafe1e4aed..03ee287f9fe05d 100644 --- a/be/src/exprs/function/array/function_array_distance.cpp +++ b/be/src/exprs/function/array/function_array_distance.cpp @@ -23,6 +23,12 @@ namespace doris { FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN float CosineDistance::distance(const float* x, const float* y, size_t d) { + if (d == 0) { + return 2.0f; + } + + DCHECK(x != nullptr && y != nullptr); + float dot_prod = 0; float squared_x = 0; float squared_y = 0; @@ -31,15 +37,24 @@ float CosineDistance::distance(const float* x, const float* y, size_t d) { squared_x += x[i] * x[i]; squared_y += y[i] * y[i]; } - if (squared_x == 0 or squared_y == 0) { + + if (squared_x == 0 || squared_y == 0) { return 2.0f; } - return 1 - dot_prod / sqrt(squared_x 
* squared_y); + + const float norm = std::sqrt(squared_x * squared_y); + return 1 - dot_prod / norm; } FAISS_PRAGMA_IMPRECISE_FUNCTION_END FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN float CosineSimilarity::distance(const float* x, const float* y, size_t d) { + if (d == 0) { + return 0.0f; + } + + DCHECK(x != nullptr && y != nullptr); + float dot_prod = 0; float squared_x = 0; float squared_y = 0; @@ -48,10 +63,13 @@ float CosineSimilarity::distance(const float* x, const float* y, size_t d) { squared_x += x[i] * x[i]; squared_y += y[i] * y[i]; } - if (squared_x == 0 or squared_y == 0) { + + if (squared_x == 0 || squared_y == 0) { return 0.0f; } - return dot_prod / sqrt(squared_x * squared_y); + + const float norm = std::sqrt(squared_x * squared_y); + return dot_prod / norm; } FAISS_PRAGMA_IMPRECISE_FUNCTION_END From 36289ef375250644e1803b148d91f7014a6dbdf5 Mon Sep 17 00:00:00 2001 From: yaoxiao Date: Thu, 21 May 2026 16:51:17 +0800 Subject: [PATCH 2/5] [fix](be) Fix numerical instability in cosine distance/similarity - Replace float norm `sqrt(squared_x * squared_y)` with double-precision intermediate to prevent float overflow for large-magnitude vectors (e.g. 1e19 per element: squared product overflows float to +inf, yielding a wrong cosine of 0/NaN instead of 1). - Clamp the final cosine to [-1, 1] before computing distance to prevent tiny negative cosine_distance values caused by floating-point rounding (e.g. identical vectors could yield cosine=1.0000001 -> distance=-1e-7). - Add cosine_distance unit tests covering identical/orthogonal/opposite/zero vectors and a known-value case. - Add large-magnitude regression test (1e19 elements) that would have caught the original overflow bug. 
--- .../array/function_array_distance.cpp | 20 ++++- .../function_array_cosine_similarity_test.cpp | 79 ++++++++++++++++++- 2 files changed, 93 insertions(+), 6 deletions(-) diff --git a/be/src/exprs/function/array/function_array_distance.cpp b/be/src/exprs/function/array/function_array_distance.cpp index 03ee287f9fe05d..3f37775d6beedf 100644 --- a/be/src/exprs/function/array/function_array_distance.cpp +++ b/be/src/exprs/function/array/function_array_distance.cpp @@ -17,6 +17,8 @@ #include "exprs/function/array/function_array_distance.h" +#include <algorithm> + #include "exprs/function/simple_function_factory.h" namespace doris { @@ -42,8 +44,16 @@ float CosineDistance::distance(const float* x, const float* y, size_t d) { return 2.0f; } - const float norm = std::sqrt(squared_x * squared_y); - return 1 - dot_prod / norm; + // Accumulate the norm in double and take a single square root. Computing + // (double)squared_x * (double)squared_y cannot overflow for finite float inputs, + // whereas the float expression sqrt(squared_x * squared_y) overflows to +inf for + // large-magnitude vectors and would silently yield a distance of 1.0. + const double norm = std::sqrt(static_cast<double>(squared_x) * static_cast<double>(squared_y)); + // Clamp the cosine to [-1, 1] before mapping to a distance. Floating-point rounding + // can push the ratio slightly outside [-1, 1] (e.g. 1.0000001 for identical vectors), + // which would otherwise produce a tiny negative distance. + const float cosine = std::clamp(static_cast<float>(dot_prod / norm), -1.0f, 1.0f); + return 1.0f - cosine; } FAISS_PRAGMA_IMPRECISE_FUNCTION_END @@ -68,8 +78,10 @@ float CosineSimilarity::distance(const float* x, const float* y, size_t d) { return 0.0f; } - const float norm = std::sqrt(squared_x * squared_y); - return dot_prod / norm; + // See CosineDistance::distance: the double-precision norm avoids float overflow, + // and clamping keeps the result within the mathematically valid [-1, 1] range. 
+ const double norm = std::sqrt(static_cast<double>(squared_x) * static_cast<double>(squared_y)); + return std::clamp(static_cast<float>(dot_prod / norm), -1.0f, 1.0f); } FAISS_PRAGMA_IMPRECISE_FUNCTION_END diff --git a/be/test/exprs/function/function_array_cosine_similarity_test.cpp b/be/test/exprs/function/function_array_cosine_similarity_test.cpp index a4928276f1e144..66b4edd4388659 100644 --- a/be/test/exprs/function/function_array_cosine_similarity_test.cpp +++ b/be/test/exprs/function/function_array_cosine_similarity_test.cpp @@ -99,8 +99,10 @@ TEST(function_cosine_similarity_test, cosine_similarity) { TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)}; TestArray vec2 = {Float32(3.0), Float32(5.0), Float32(7.0)}; - // Expected: 34 / sqrt(14 * 83) = 34 / sqrt(1162) ≈ 0.9974149 - float expected = 34.0f / std::sqrt(14.0f * 83.0f); + // Expected: 34 / sqrt(14 * 83) = 34 / sqrt(1162) ≈ 0.9974149. + // Mirror the production formula exactly (double-precision norm) so the + // exact float comparison in check_function matches bit-for-bit. 
+ float expected = static_cast(34.0 / std::sqrt(14.0 * 83.0)); DataSet data_set = {{{vec1, vec2}, Float32(expected)}}; static_cast(check_function(func_name, input_types, data_set)); @@ -156,4 +158,77 @@ TEST(function_cosine_similarity_test, cosine_similarity) { } } +TEST(function_cosine_distance_test, cosine_distance) { + std::string func_name = "cosine_distance"; + TestArray empty_arr; + InputTypeSet input_types = {PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT, + PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT}; + + // identical vectors -> distance 0.0 (and crucially never a negative distance) + { + TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)}; + TestArray vec2 = {Float32(1.0), Float32(2.0), Float32(3.0)}; + DataSet data_set = {{{vec1, vec2}, Float32(0.0)}}; + static_cast(check_function(func_name, input_types, data_set)); + } + + // orthogonal vectors -> distance 1.0 + { + TestArray vec1 = {Float32(1.0), Float32(0.0)}; + TestArray vec2 = {Float32(0.0), Float32(1.0)}; + DataSet data_set = {{{vec1, vec2}, Float32(1.0)}}; + static_cast(check_function(func_name, input_types, data_set)); + } + + // opposite vectors -> distance 2.0 + { + TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)}; + TestArray vec2 = {Float32(-1.0), Float32(-2.0), Float32(-3.0)}; + DataSet data_set = {{{vec1, vec2}, Float32(2.0)}}; + static_cast(check_function(func_name, input_types, data_set)); + } + + // zero vector and empty array keep the legacy fallback distance of 2.0 + { + TestArray zero_vec = {Float32(0.0), Float32(0.0), Float32(0.0)}; + TestArray vec = {Float32(1.0), Float32(2.0), Float32(3.0)}; + DataSet data_set = {{{zero_vec, vec}, Float32(2.0)}, + {{empty_arr, empty_arr}, Float32(2.0)}}; + static_cast(check_function(func_name, input_types, data_set)); + } + + // known value: 1 - 34 / sqrt(14 * 83). Mirror the production formula exactly. 
+ { + TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)}; + TestArray vec2 = {Float32(3.0), Float32(5.0), Float32(7.0)}; + float expected = 1.0f - static_cast(34.0 / std::sqrt(14.0 * 83.0)); + DataSet data_set = {{{vec1, vec2}, Float32(expected)}}; + static_cast(check_function(func_name, input_types, data_set)); + } +} + +// Regression tests for the numerical-stability fixes: large-magnitude vectors must +// not overflow the norm (legacy sqrt(squared_x * squared_y) produced +inf and a +// wrong result), and the cosine must stay within [-1, 1]. +TEST(function_cosine_numerical_stability_test, large_magnitude_no_overflow) { + InputTypeSet input_types = {PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT, + PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT}; + + // squared_x = squared_y = 2e38 (within FLT_MAX), but squared_x * squared_y = 4e76 + // overflows float. The double-precision norm keeps parallel vectors at cos = 1.0. + TestArray big1 = {Float32(1e19), Float32(1e19)}; + TestArray big2 = {Float32(1e19), Float32(1e19)}; + + { + DataSet data_set = {{{big1, big2}, Float32(1.0)}}; + static_cast(check_function("cosine_similarity", input_types, + data_set)); + } + { + DataSet data_set = {{{big1, big2}, Float32(0.0)}}; + static_cast( + check_function("cosine_distance", input_types, data_set)); + } +} + } // namespace doris From e94725c6cc4be0a56c56feb726db33e3766af171 Mon Sep 17 00:00:00 2001 From: yaoxiao Date: Sun, 24 May 2026 14:41:59 +0800 Subject: [PATCH 3/5] [fix](be) Fix clang-format violation in cosine similarity test Reformat overlong static_cast(...) call to comply with 100-column limit. 
--- .../exprs/function/function_array_cosine_similarity_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/be/test/exprs/function/function_array_cosine_similarity_test.cpp b/be/test/exprs/function/function_array_cosine_similarity_test.cpp index 66b4edd4388659..f281ef17fe2f20 100644 --- a/be/test/exprs/function/function_array_cosine_similarity_test.cpp +++ b/be/test/exprs/function/function_array_cosine_similarity_test.cpp @@ -221,8 +221,8 @@ TEST(function_cosine_numerical_stability_test, large_magnitude_no_overflow) { { DataSet data_set = {{{big1, big2}, Float32(1.0)}}; - static_cast(check_function("cosine_similarity", input_types, - data_set)); + static_cast( + check_function("cosine_similarity", input_types, data_set)); } { DataSet data_set = {{{big1, big2}, Float32(0.0)}}; From 12b0ce08a9489e0e2a3b8b8605fbcb0f5cfb0429 Mon Sep 17 00:00:00 2001 From: yaoxiao Date: Tue, 26 May 2026 17:09:00 +0800 Subject: [PATCH 4/5] [fix](regression) Update cosine distance/similarity expected values After switching to double-precision norm in cosine_distance/cosine_similarity, the computed values are slightly more accurate (closer to the mathematical truth). 
Update the regression .out expectations accordingly: - cosine_distance([1,2,3],[3,5,7]): 0.002585053 -> 0.002585113 (true: ~0.0025851140) - cosine_distance([1.0,2.0,3.0],[4.0,5.0,6.0]): 0.02536809 -> 0.02536815 - cosine_similarity([0.001,0.002],[0.003,0.004]): 0.9838699 -> 0.98387 - cosine_similarity + cosine_distance roundtrip: 0.9999999534439087 -> 1.0 (clamp guarantees sim + dist = 1.0 by construction) - table row 3 cosine_similarity: 0.9746319 -> 0.9746318 --- .../array_functions/test_array_distance_functions.out | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/regression-test/data/query_p0/sql_functions/array_functions/test_array_distance_functions.out b/regression-test/data/query_p0/sql_functions/array_functions/test_array_distance_functions.out index 071a20f477e015..a2b3a0c837c309 100644 --- a/regression-test/data/query_p0/sql_functions/array_functions/test_array_distance_functions.out +++ b/regression-test/data/query_p0/sql_functions/array_functions/test_array_distance_functions.out @@ -6,7 +6,7 @@ 3.741657 -- !sql -- -0.002585053 +0.002585113 -- !sql -- 2.0 @@ -18,7 +18,7 @@ 2.828427 -- !sql -- -0.02536809 +0.02536815 -- !sql -- 23.0 @@ -69,7 +69,7 @@ -1.0 -- !cosine_sim_distance_relation -- -0.9999999534439087 +1.0 -- !cosine_sim_empty -- 0.0 @@ -78,12 +78,12 @@ 0.9838699 -- !cosine_sim_small -- -0.9838699 +0.98387 -- !cosine_sim_table -- 1 1.0 2 0.0 -3 0.9746319 +3 0.9746318 4 -1.0 5 0.96 From d650aba8beb6484026442ce652e94b1577a7a234 Mon Sep 17 00:00:00 2001 From: yaoxiao Date: Tue, 26 May 2026 22:07:07 +0800 Subject: [PATCH 5/5] [fix](test) Remove redundant assert_cast on get_null_map_column_ptr After #63491 typed _null_map as ColumnUInt8::WrappedPtr, get_null_map_column_ptr() now returns ColumnUInt8::MutablePtr directly, making assert_cast(...) a same-type cast which is rejected by the static_assert added in #63059. Drop the wrapper. 
--- be/test/exprs/function/geo/functions_geo_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/test/exprs/function/geo/functions_geo_test.cpp b/be/test/exprs/function/geo/functions_geo_test.cpp index dcea4939fe1dcc..338df011af0a39 100644 --- a/be/test/exprs/function/geo/functions_geo_test.cpp +++ b/be/test/exprs/function/geo/functions_geo_test.cpp @@ -372,7 +372,7 @@ TEST(VGeoFunctionsTest, function_geo_st_geometries_invalid) { // Insert non-null but invalid data auto* nullable_input = assert_cast(input_col.get()); nullable_input->get_nested_column_ptr()->insert_data(invalid_buf.data(), invalid_buf.size()); - assert_cast(nullable_input->get_null_map_column_ptr().get())->insert_value(0); + nullable_input->get_null_map_column_ptr()->insert_value(0); block.insert({std::move(input_col), input_type, "shape"}); FunctionBasePtr func = SimpleFunctionFactory::instance().get_function(