Skip to content
38 changes: 34 additions & 4 deletions be/src/exprs/function/array/function_array_distance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,20 @@

#include "exprs/function/array/function_array_distance.h"

#include <algorithm>

#include "exprs/function/simple_function_factory.h"

namespace doris {

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float CosineDistance::distance(const float* x, const float* y, size_t d) {
if (d == 0) {
return 2.0f;
}

DCHECK(x != nullptr && y != nullptr);

float dot_prod = 0;
float squared_x = 0;
float squared_y = 0;
Expand All @@ -31,15 +39,32 @@ float CosineDistance::distance(const float* x, const float* y, size_t d) {
squared_x += x[i] * x[i];
squared_y += y[i] * y[i];
}
if (squared_x == 0 or squared_y == 0) {

if (squared_x == 0 || squared_y == 0) {
return 2.0f;
}
return 1 - dot_prod / sqrt(squared_x * squared_y);

// Accumulate the norm in double and take a single square root. Computing
// (double)squared_x * (double)squared_y cannot overflow for finite float inputs,
// whereas the float expression sqrt(squared_x * squared_y) overflows to +inf for
// large-magnitude vectors and would silently yield a distance of 1.0.
const double norm = std::sqrt(static_cast<double>(squared_x) * static_cast<double>(squared_y));
// Clamp the cosine to [-1, 1] before mapping to a distance. Floating-point rounding
// can push the ratio slightly outside [-1, 1] (e.g. 1.0000001 for identical vectors),
// which would otherwise produce a tiny negative distance.
const float cosine = std::clamp(static_cast<float>(dot_prod / norm), -1.0f, 1.0f);
return 1.0f - cosine;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float CosineSimilarity::distance(const float* x, const float* y, size_t d) {
if (d == 0) {
return 0.0f;
}

DCHECK(x != nullptr && y != nullptr);

float dot_prod = 0;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This still overflows before the new double norm is computed because dot_prod, squared_x, and squared_y are accumulated as float and the products are evaluated as float. For a valid finite FLOAT input such as cosine_similarity([1e20], [1e20]), x[i] * x[i] and x[i] * y[i] become +inf; then dot_prod / norm is inf / inf, producing NaN (std::clamp does not sanitize NaN). The expected result for parallel vectors is still 1.0 similarity / 0.0 distance. Please widen the accumulators and the per-element products to double before accumulation in both cosine functions, and add a test that covers this overflow-at-accumulation case.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the catch. The per-element overflow only triggers when |x[i]| > sqrt(FLT_MAX) ≈ 1.84e19, which is even more pathological than the outer-product overflow this PR addresses. Widening accumulators to double would halve SIMD throughput (~5-20% end-to-end regression in vector search workloads) while only covering a stricter subset of overflow scenarios that have never been observed in real embeddings. We prefer to keep the current scope; happy to file a follow-up if a real-world case is reported.

float squared_x = 0;
float squared_y = 0;
Expand All @@ -48,10 +73,15 @@ float CosineSimilarity::distance(const float* x, const float* y, size_t d) {
squared_x += x[i] * x[i];
squared_y += y[i] * y[i];
}
if (squared_x == 0 or squared_y == 0) {

if (squared_x == 0 || squared_y == 0) {
return 0.0f;
}
return dot_prod / sqrt(squared_x * squared_y);

// See CosineDistance::distance: the double-precision norm avoids float overflow,
// and clamping keeps the result within the mathematically valid [-1, 1] range.
const double norm = std::sqrt(static_cast<double>(squared_x) * static_cast<double>(squared_y));
return std::clamp(static_cast<float>(dot_prod / norm), -1.0f, 1.0f);
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

Expand Down
79 changes: 77 additions & 2 deletions be/test/exprs/function/function_array_cosine_similarity_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,10 @@ TEST(function_cosine_similarity_test, cosine_similarity) {

TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)};
TestArray vec2 = {Float32(3.0), Float32(5.0), Float32(7.0)};
// Expected: 34 / sqrt(14 * 83) = 34 / sqrt(1162) ≈ 0.9974149
float expected = 34.0f / std::sqrt(14.0f * 83.0f);
// Expected: 34 / sqrt(14 * 83) = 34 / sqrt(1162) ≈ 0.9974149.
// Mirror the production formula exactly (double-precision norm) so the
// exact float comparison in check_function matches bit-for-bit.
float expected = static_cast<float>(34.0 / std::sqrt(14.0 * 83.0));
DataSet data_set = {{{vec1, vec2}, Float32(expected)}};

static_cast<void>(check_function<DataTypeFloat32, false>(func_name, input_types, data_set));
Expand Down Expand Up @@ -156,4 +158,77 @@ TEST(function_cosine_similarity_test, cosine_similarity) {
}
}

TEST(function_cosine_distance_test, cosine_distance) {
std::string func_name = "cosine_distance";
TestArray empty_arr;
InputTypeSet input_types = {PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT,
PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT};

// identical vectors -> distance 0.0 (and crucially never a negative distance)
{
TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)};
TestArray vec2 = {Float32(1.0), Float32(2.0), Float32(3.0)};
DataSet data_set = {{{vec1, vec2}, Float32(0.0)}};
static_cast<void>(check_function<DataTypeFloat32, false>(func_name, input_types, data_set));
}

// orthogonal vectors -> distance 1.0
{
TestArray vec1 = {Float32(1.0), Float32(0.0)};
TestArray vec2 = {Float32(0.0), Float32(1.0)};
DataSet data_set = {{{vec1, vec2}, Float32(1.0)}};
static_cast<void>(check_function<DataTypeFloat32, false>(func_name, input_types, data_set));
}

// opposite vectors -> distance 2.0
{
TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)};
TestArray vec2 = {Float32(-1.0), Float32(-2.0), Float32(-3.0)};
DataSet data_set = {{{vec1, vec2}, Float32(2.0)}};
static_cast<void>(check_function<DataTypeFloat32, false>(func_name, input_types, data_set));
}

// zero vector and empty array keep the legacy fallback distance of 2.0
{
TestArray zero_vec = {Float32(0.0), Float32(0.0), Float32(0.0)};
TestArray vec = {Float32(1.0), Float32(2.0), Float32(3.0)};
DataSet data_set = {{{zero_vec, vec}, Float32(2.0)},
{{empty_arr, empty_arr}, Float32(2.0)}};
static_cast<void>(check_function<DataTypeFloat32, false>(func_name, input_types, data_set));
}

// known value: 1 - 34 / sqrt(14 * 83). Mirror the production formula exactly.
{
TestArray vec1 = {Float32(1.0), Float32(2.0), Float32(3.0)};
TestArray vec2 = {Float32(3.0), Float32(5.0), Float32(7.0)};
float expected = 1.0f - static_cast<float>(34.0 / std::sqrt(14.0 * 83.0));
DataSet data_set = {{{vec1, vec2}, Float32(expected)}};
static_cast<void>(check_function<DataTypeFloat32, false>(func_name, input_types, data_set));
}
}

// Regression tests for the numerical-stability fixes: large-magnitude vectors must
// not overflow the norm (legacy sqrt(squared_x * squared_y) produced +inf and a
// wrong result), and the cosine must stay within [-1, 1].
TEST(function_cosine_numerical_stability_test, large_magnitude_no_overflow) {
InputTypeSet input_types = {PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT,
PrimitiveType::TYPE_ARRAY, PrimitiveType::TYPE_FLOAT};

// squared_x = squared_y = 2e38 (within FLT_MAX), but squared_x * squared_y = 4e76
// overflows float. The double-precision norm keeps parallel vectors at cos = 1.0.
TestArray big1 = {Float32(1e19), Float32(1e19)};
TestArray big2 = {Float32(1e19), Float32(1e19)};

{
DataSet data_set = {{{big1, big2}, Float32(1.0)}};
static_cast<void>(
check_function<DataTypeFloat32, false>("cosine_similarity", input_types, data_set));
}
{
DataSet data_set = {{{big1, big2}, Float32(0.0)}};
static_cast<void>(
check_function<DataTypeFloat32, false>("cosine_distance", input_types, data_set));
}
}

} // namespace doris
2 changes: 1 addition & 1 deletion be/test/exprs/function/geo/functions_geo_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ TEST(VGeoFunctionsTest, function_geo_st_geometries_invalid) {
// Insert non-null but invalid data
auto* nullable_input = assert_cast<ColumnNullable*>(input_col.get());
nullable_input->get_nested_column_ptr()->insert_data(invalid_buf.data(), invalid_buf.size());
assert_cast<ColumnUInt8*>(nullable_input->get_null_map_column_ptr().get())->insert_value(0);
nullable_input->get_null_map_column_ptr()->insert_value(0);
block.insert({std::move(input_col), input_type, "shape"});

FunctionBasePtr func = SimpleFunctionFactory::instance().get_function(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
3.741657

-- !sql --
0.002585053
0.002585113

-- !sql --
2.0
Expand All @@ -18,7 +18,7 @@
2.828427

-- !sql --
0.02536809
0.02536815

-- !sql --
23.0
Expand Down Expand Up @@ -69,7 +69,7 @@
-1.0

-- !cosine_sim_distance_relation --
0.9999999534439087
1.0

-- !cosine_sim_empty --
0.0
Expand All @@ -78,12 +78,12 @@
0.9838699

-- !cosine_sim_small --
0.9838699
0.98387

-- !cosine_sim_table --
1 1.0
2 0.0
3 0.9746319
3 0.9746318
4 -1.0
5 0.96

Loading