apache · yiguolei · May 28, 2026 · Apr 25, 2026 · May 15, 2026 · May 21, 2026
diff --git a/be/src/exprs/function/array/function_array_distance.cpp b/be/src/exprs/function/array/function_array_distance.cpp
@@ -17,12 +17,20 @@
 
 #include "exprs/function/array/function_array_distance.h"
 
+#include <algorithm>
+
 #include "exprs/function/simple_function_factory.h"
 
 namespace doris {
 
 FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 float CosineDistance::distance(const float* x, const float* y, size_t d) {
+    if (d == 0) {
+        return 2.0f;
+    }
+
+    DCHECK(x != nullptr && y != nullptr);
+
     float dot_prod = 0;
     float squared_x = 0;
     float squared_y = 0;
@@ -31,15 +39,32 @@ float CosineDistance::distance(const float* x, const float* y, size_t d) {
         squared_x += x[i] * x[i];
         squared_y += y[i] * y[i];
     }
-    if (squared_x == 0 or squared_y == 0) {
+
+    if (squared_x == 0 || squared_y == 0) {
         return 2.0f;
     }
-    return 1 - dot_prod / sqrt(squared_x * squared_y);
+
+    // Accumulate the norm in double and take a single square root. Computing
+    // (double)squared_x * (double)squared_y cannot overflow for finite float inputs,
+    // whereas the float expression sqrt(squared_x * squared_y) overflows to +inf for
+    // large-magnitude vectors and would silently yield a distance of 1.0.
+    const double norm = std::sqrt(static_cast<double>(squared_x) * static_cast<double>(squared_y));
+    // Clamp the cosine to [-1, 1] before mapping to a distance. Floating-point rounding
+    // can push the ratio slightly outside [-1, 1] (e.g. 1.0000001 for identical vectors),
+    // which would otherwise produce a tiny negative distance.
+    const float cosine = std::clamp(static_cast<float>(dot_prod / norm), -1.0f, 1.0f);
+    return 1.0f - cosine;
 }
 FAISS_PRAGMA_IMPRECISE_FUNCTION_END
 
 FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 float CosineSimilarity::distance(const float* x, const float* y, size_t d) {
+    if (d == 0) {
+        return 0.0f;
+    }
+
+    DCHECK(x != nullptr && y != nullptr);
+
     float dot_prod = 0;
     float squared_x = 0;
     float squared_y = 0;
@@ -48,10 +73,15 @@ float CosineSimilarity::distance(const float* x, const float* y, size_t d) {
         squared_x += x[i] * x[i];
         squared_y += y[i] * y[i];
     }
-    if (squared_x == 0 or squared_y == 0) {
+
+    if (squared_x == 0 || squared_y == 0) {
         return 0.0f;
     }
-    return dot_prod / sqrt(squared_x * squared_y);
+
+    // See CosineDistance::distance: the double-precision norm avoids float overflow,
+    // and clamping keeps the result within the mathematically valid [-1, 1] range.
+    const double norm = std::sqrt(static_cast<double>(squared_x) * static_cast<double>(squared_y));
+    return std::clamp(static_cast<float>(dot_prod / norm), -1.0f, 1.0f);
 }
 FAISS_PRAGMA_IMPRECISE_FUNCTION_END
 

diff --git a/be/test/exprs/function/function_array_cosine_similarity_test.cpp b/be/test/exprs/function/function_array_cosine_similarity_test.cpp
@@ -99,8 +99,10 @@ TEST(function_cosine_similarity_test, cosine_similarity) {
 
         TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)};
         TestArray vec2 = {Float32(3.0), Float32(5.0), Float32(7.0)};
-        // Expected: 34 / sqrt(14 * 83) = 34 / sqrt(1162) ≈ 0.9974149
-        float expected = 34.0f / std::sqrt(14.0f * 83.0f);
+        // Expected: 34 / sqrt(14 * 83) = 34 / sqrt(1162) ≈ 0.9974149.
+        // Mirror the production formula exactly (double-precision norm) so the
+        // exact float comparison in check_function matches bit-for-bit.
+        float expected = static_cast<float>(34.0 / std::sqrt(14.0 * 83.0));
         DataSet data_set = {{{vec1, vec2}, Float32(expected)}};
 
         static_cast<void>(check_function<DataTypeFloat32, false>(func_name, input_types, data_set));
@@ -156,4 +158,77 @@ TEST(function_cosine_similarity_test, cosine_similarity) {
     }
 }
 
+TEST(function_cosine_distance_test, cosine_distance) {
+    std::string func_name = "cosine_distance";
+    TestArray empty_arr;
+    InputTypeSet input_types = {PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT,
+                                PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT};
+
+    // identical vectors -> distance 0.0 (and crucially never a negative distance)
+    {
+        TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)};
+        TestArray vec2 = {Float32(1.0), Float32(2.0), Float32(3.0)};
+        DataSet data_set = {{{vec1, vec2}, Float32(0.0)}};
+        static_cast<void>(check_function<DataTypeFloat32, false>(func_name, input_types, data_set));
+    }
+
+    // orthogonal vectors -> distance 1.0
+    {
+        TestArray vec1 = {Float32(1.0), Float32(0.0)};
+        TestArray vec2 = {Float32(0.0), Float32(1.0)};
+        DataSet data_set = {{{vec1, vec2}, Float32(1.0)}};
+        static_cast<void>(check_function<DataTypeFloat32, false>(func_name, input_types, data_set));
+    }
+
+    // opposite vectors -> distance 2.0
+    {
+        TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)};
+        TestArray vec2 = {Float32(-1.0), Float32(-2.0), Float32(-3.0)};
+        DataSet data_set = {{{vec1, vec2}, Float32(2.0)}};
+        static_cast<void>(check_function<DataTypeFloat32, false>(func_name, input_types, data_set));
+    }
+
+    // zero vector and empty array keep the legacy fallback distance of 2.0
+    {
+        TestArray zero_vec = {Float32(0.0), Float32(0.0), Float32(0.0)};
+        TestArray vec = {Float32(1.0), Float32(2.0), Float32(3.0)};
+        DataSet data_set = {{{zero_vec, vec}, Float32(2.0)},
+                            {{empty_arr, empty_arr}, Float32(2.0)}};
+        static_cast<void>(check_function<DataTypeFloat32, false>(func_name, input_types, data_set));
+    }
+
+    // known value: 1 - 34 / sqrt(14 * 83). Mirror the production formula exactly.
+    {
+        TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)};
+        TestArray vec2 = {Float32(3.0), Float32(5.0), Float32(7.0)};
+        float expected = 1.0f - static_cast<float>(34.0 / std::sqrt(14.0 * 83.0));
+        DataSet data_set = {{{vec1, vec2}, Float32(expected)}};
+        static_cast<void>(check_function<DataTypeFloat32, false>(func_name, input_types, data_set));
+    }
+}
+
+// Regression tests for the numerical-stability fixes: large-magnitude vectors must
+// not overflow the norm (legacy sqrt(squared_x * squared_y) produced +inf and a
+// wrong result), and the cosine must stay within [-1, 1].
+TEST(function_cosine_numerical_stability_test, large_magnitude_no_overflow) {
+    InputTypeSet input_types = {PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT,
+                                PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT};
+
+    // squared_x = squared_y = 2e38 (within FLT_MAX), but squared_x * squared_y = 4e76
+    // overflows float. The double-precision norm keeps parallel vectors at cos = 1.0.
+    TestArray big1 = {Float32(1e19), Float32(1e19)};
+    TestArray big2 = {Float32(1e19), Float32(1e19)};
+
+    {
+        DataSet data_set = {{{big1, big2}, Float32(1.0)}};
+        static_cast<void>(
+                check_function<DataTypeFloat32, false>("cosine_similarity", input_types, data_set));
+    }
+    {
+        DataSet data_set = {{{big1, big2}, Float32(0.0)}};
+        static_cast<void>(
+                check_function<DataTypeFloat32, false>("cosine_distance", input_types, data_set));
+    }
+}
+
 } // namespace doris
diff --git a/be/test/exprs/function/geo/functions_geo_test.cpp b/be/test/exprs/function/geo/functions_geo_test.cpp
@@ -372,7 +372,7 @@ TEST(VGeoFunctionsTest, function_geo_st_geometries_invalid) {
     // Insert non-null but invalid data
     auto* nullable_input = assert_cast<ColumnNullable*>(input_col.get());
     nullable_input->get_nested_column_ptr()->insert_data(invalid_buf.data(), invalid_buf.size());
-    assert_cast<ColumnUInt8*>(nullable_input->get_null_map_column_ptr().get())->insert_value(0);
+    nullable_input->get_null_map_column_ptr()->insert_value(0);
     block.insert({std::move(input_col), input_type, "shape"});
 
     FunctionBasePtr func = SimpleFunctionFactory::instance().get_function(

diff --git a/...ession-test/data/query_p0/sql_functions/array_functions/test_array_distance_functions.out b/...ession-test/data/query_p0/sql_functions/array_functions/test_array_distance_functions.out
@@ -6,7 +6,7 @@
 3.741657
 
 -- !sql --
-0.002585053
+0.002585113
 
 -- !sql --
 2.0
@@ -18,7 +18,7 @@
 2.828427
 
 -- !sql --
-0.02536809
+0.02536815
 
 -- !sql --
 23.0
@@ -69,7 +69,7 @@
 -1.0
 
 -- !cosine_sim_distance_relation --
-0.9999999534439087
+1.0
 
 -- !cosine_sim_empty --
 0.0
@@ -78,12 +78,12 @@
 0.9838699
 
 -- !cosine_sim_small --
-0.9838699
+0.98387
 
 -- !cosine_sim_table --
 1	1.0
 2	0.0
-3	0.9746319
+3	0.9746318
 4	-1.0
 5	0.96