diff --git a/be/src/exprs/function/array/function_array_distance.cpp b/be/src/exprs/function/array/function_array_distance.cpp index 89a0dafe1e4aed..3f37775d6beedf 100644 --- a/be/src/exprs/function/array/function_array_distance.cpp +++ b/be/src/exprs/function/array/function_array_distance.cpp @@ -17,12 +17,20 @@ #include "exprs/function/array/function_array_distance.h" +#include + #include "exprs/function/simple_function_factory.h" namespace doris { FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN float CosineDistance::distance(const float* x, const float* y, size_t d) { + if (d == 0) { + return 2.0f; + } + + DCHECK(x != nullptr && y != nullptr); + float dot_prod = 0; float squared_x = 0; float squared_y = 0; @@ -31,15 +39,32 @@ float CosineDistance::distance(const float* x, const float* y, size_t d) { squared_x += x[i] * x[i]; squared_y += y[i] * y[i]; } - if (squared_x == 0 or squared_y == 0) { + + if (squared_x == 0 || squared_y == 0) { return 2.0f; } - return 1 - dot_prod / sqrt(squared_x * squared_y); + + // Accumulate the norm in double and take a single square root. Computing + // (double)squared_x * (double)squared_y cannot overflow for finite float inputs, + // whereas the float expression sqrt(squared_x * squared_y) overflows to +inf for + // large-magnitude vectors and would silently yield a distance of 1.0. + const double norm = std::sqrt(static_cast(squared_x) * static_cast(squared_y)); + // Clamp the cosine to [-1, 1] before mapping to a distance. Floating-point rounding + // can push the ratio slightly outside [-1, 1] (e.g. 1.0000001 for identical vectors), + // which would otherwise produce a tiny negative distance. + const float cosine = std::clamp(static_cast(dot_prod / norm), -1.0f, 1.0f); + return 1.0f - cosine; } FAISS_PRAGMA_IMPRECISE_FUNCTION_END FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN float CosineSimilarity::distance(const float* x, const float* y, size_t d) { + if (d == 0) { + return 0.0f; + } + + DCHECK(x != nullptr && y != nullptr); + float dot_prod = 0; float squared_x = 0; float squared_y = 0; @@ -48,10 +73,15 @@ float CosineSimilarity::distance(const float* x, const float* y, size_t d) { squared_x += x[i] * x[i]; squared_y += y[i] * y[i]; } - if (squared_x == 0 or squared_y == 0) { + + if (squared_x == 0 || squared_y == 0) { return 0.0f; } - return dot_prod / sqrt(squared_x * squared_y); + + // See CosineDistance::distance: the double-precision norm avoids float overflow, + // and clamping keeps the result within the mathematically valid [-1, 1] range. + const double norm = std::sqrt(static_cast(squared_x) * static_cast(squared_y)); + return std::clamp(static_cast(dot_prod / norm), -1.0f, 1.0f); } FAISS_PRAGMA_IMPRECISE_FUNCTION_END diff --git a/be/test/exprs/function/function_array_cosine_similarity_test.cpp b/be/test/exprs/function/function_array_cosine_similarity_test.cpp index a4928276f1e144..f281ef17fe2f20 100644 --- a/be/test/exprs/function/function_array_cosine_similarity_test.cpp +++ b/be/test/exprs/function/function_array_cosine_similarity_test.cpp @@ -99,8 +99,10 @@ TEST(function_cosine_similarity_test, cosine_similarity) { TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)}; TestArray vec2 = {Float32(3.0), Float32(5.0), Float32(7.0)}; - // Expected: 34 / sqrt(14 * 83) = 34 / sqrt(1162) ≈ 0.9974149 - float expected = 34.0f / std::sqrt(14.0f * 83.0f); + // Expected: 34 / sqrt(14 * 83) = 34 / sqrt(1162) ≈ 0.9974149. + // Mirror the production formula exactly (double-precision norm) so the + // exact float comparison in check_function matches bit-for-bit. + float expected = static_cast(34.0 / std::sqrt(14.0 * 83.0)); DataSet data_set = {{{vec1, vec2}, Float32(expected)}}; static_cast(check_function(func_name, input_types, data_set)); @@ -156,4 +158,77 @@ TEST(function_cosine_similarity_test, cosine_similarity) { } } +TEST(function_cosine_distance_test, cosine_distance) { + std::string func_name = "cosine_distance"; + TestArray empty_arr; + InputTypeSet input_types = {PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT, + PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT}; + + // identical vectors -> distance 0.0 (and crucially never a negative distance) + { + TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)}; + TestArray vec2 = {Float32(1.0), Float32(2.0), Float32(3.0)}; + DataSet data_set = {{{vec1, vec2}, Float32(0.0)}}; + static_cast(check_function(func_name, input_types, data_set)); + } + + // orthogonal vectors -> distance 1.0 + { + TestArray vec1 = {Float32(1.0), Float32(0.0)}; + TestArray vec2 = {Float32(0.0), Float32(1.0)}; + DataSet data_set = {{{vec1, vec2}, Float32(1.0)}}; + static_cast(check_function(func_name, input_types, data_set)); + } + + // opposite vectors -> distance 2.0 + { + TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)}; + TestArray vec2 = {Float32(-1.0), Float32(-2.0), Float32(-3.0)}; + DataSet data_set = {{{vec1, vec2}, Float32(2.0)}}; + static_cast(check_function(func_name, input_types, data_set)); + } + + // zero vector and empty array keep the legacy fallback distance of 2.0 + { + TestArray zero_vec = {Float32(0.0), Float32(0.0), Float32(0.0)}; + TestArray vec = {Float32(1.0), Float32(2.0), Float32(3.0)}; + DataSet data_set = {{{zero_vec, vec}, Float32(2.0)}, + {{empty_arr, empty_arr}, Float32(2.0)}}; + static_cast(check_function(func_name, input_types, data_set)); + } + + // known value: 1 - 34 / sqrt(14 * 83). Mirror the production formula exactly. + { + TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)}; + TestArray vec2 = {Float32(3.0), Float32(5.0), Float32(7.0)}; + float expected = 1.0f - static_cast(34.0 / std::sqrt(14.0 * 83.0)); + DataSet data_set = {{{vec1, vec2}, Float32(expected)}}; + static_cast(check_function(func_name, input_types, data_set)); + } +} + +// Regression tests for the numerical-stability fixes: large-magnitude vectors must +// not overflow the norm (legacy sqrt(squared_x * squared_y) produced +inf and a +// wrong result), and the cosine must stay within [-1, 1]. +TEST(function_cosine_numerical_stability_test, large_magnitude_no_overflow) { + InputTypeSet input_types = {PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT, + PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT}; + + // squared_x = squared_y = 2e38 (within FLT_MAX), but squared_x * squared_y = 4e76 + // overflows float. The double-precision norm keeps parallel vectors at cos = 1.0. + TestArray big1 = {Float32(1e19), Float32(1e19)}; + TestArray big2 = {Float32(1e19), Float32(1e19)}; + + { + DataSet data_set = {{{big1, big2}, Float32(1.0)}}; + static_cast( + check_function("cosine_similarity", input_types, data_set)); + } + { + DataSet data_set = {{{big1, big2}, Float32(0.0)}}; + static_cast( + check_function("cosine_distance", input_types, data_set)); + } +} + } // namespace doris diff --git a/be/test/exprs/function/geo/functions_geo_test.cpp b/be/test/exprs/function/geo/functions_geo_test.cpp index dcea4939fe1dcc..338df011af0a39 100644 --- a/be/test/exprs/function/geo/functions_geo_test.cpp +++ b/be/test/exprs/function/geo/functions_geo_test.cpp @@ -372,7 +372,7 @@ TEST(VGeoFunctionsTest, function_geo_st_geometries_invalid) { // Insert non-null but invalid data auto* nullable_input = assert_cast(input_col.get()); nullable_input->get_nested_column_ptr()->insert_data(invalid_buf.data(), invalid_buf.size()); - assert_cast(nullable_input->get_null_map_column_ptr().get())->insert_value(0); + nullable_input->get_null_map_column_ptr()->insert_value(0); block.insert({std::move(input_col), input_type, "shape"}); FunctionBasePtr func = SimpleFunctionFactory::instance().get_function( diff --git a/regression-test/data/query_p0/sql_functions/array_functions/test_array_distance_functions.out b/regression-test/data/query_p0/sql_functions/array_functions/test_array_distance_functions.out index 071a20f477e015..a2b3a0c837c309 100644 --- a/regression-test/data/query_p0/sql_functions/array_functions/test_array_distance_functions.out +++ b/regression-test/data/query_p0/sql_functions/array_functions/test_array_distance_functions.out @@ -6,7 +6,7 @@ 3.741657 -- !sql -- -0.002585053 +0.002585113 -- !sql -- 2.0 @@ -18,7 +18,7 @@ 2.828427 -- !sql -- -0.02536809 +0.02536815 -- !sql -- 23.0 @@ -69,7 +69,7 @@ -1.0 -- !cosine_sim_distance_relation -- -0.9999999534439087 +1.0 -- !cosine_sim_empty -- 0.0 @@ -78,12 +78,12 @@ 0.9838699 -- !cosine_sim_small -- -0.9838699 +0.98387 -- !cosine_sim_table -- 1 1.0 2 0.0 -3 0.9746319 +3 0.9746318 4 -1.0 5 0.96