From d32cbb7e138755b4e0b01c2f95398f736e7a2e91 Mon Sep 17 00:00:00 2001
From: kigland
Date: Sat, 30 May 2026 14:17:05 +0800
Subject: [PATCH] fix: guard neighbour lookups in replace_blank

replace_blank keeps a space only when both neighbours are ASCII
non-space characters, but it indexed text[i + 1] / text[i - 1] without
bounds checks. A trailing space raised IndexError, and a leading space
read text[-1] (wrapping to the last character) and was wrongly kept.

Restrict the check to interior positions so edge spaces are dropped and
no out-of-range access occurs; interior spacing is unchanged.
---
 src/voxcpm/utils/text_normalize.py | 13 ++++++++++++-
 tests/test_text_normalize.py       | 18 ++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_text_normalize.py

diff --git a/src/voxcpm/utils/text_normalize.py b/src/voxcpm/utils/text_normalize.py
index 423a173..f5328e1 100644
--- a/src/voxcpm/utils/text_normalize.py
+++ b/src/voxcpm/utils/text_normalize.py
@@ -112,7 +112,18 @@ def replace_blank(text: str):
     out_str = []
     for i, c in enumerate(text):
         if c == " ":
-            if (text[i + 1].isascii() and text[i + 1] != " ") and (text[i - 1].isascii() and text[i - 1] != " "):
+            # Keep a space only when it sits between two ASCII word characters.
+            # Guard the neighbour lookups: a trailing space would make text[i + 1]
+            # raise IndexError, and a leading space would make text[i - 1] wrap
+            # around to the last character. Edge spaces are not between two
+            # words, so they are dropped.
+            if (
+                0 < i < len(text) - 1
+                and text[i + 1].isascii()
+                and text[i + 1] != " "
+                and text[i - 1].isascii()
+                and text[i - 1] != " "
+            ):
                 out_str.append(c)
         else:
             out_str.append(c)
diff --git a/tests/test_text_normalize.py b/tests/test_text_normalize.py
new file mode 100644
index 0000000..a047bb4
--- /dev/null
+++ b/tests/test_text_normalize.py
@@ -0,0 +1,18 @@
+from voxcpm.utils.text_normalize import replace_blank
+
+
+def test_replace_blank_keeps_interior_ascii_space():
+    assert replace_blank("a b") == "a b"
+
+
+def test_replace_blank_drops_edge_spaces():
+    # A space is only kept between two ASCII word characters. A trailing space
+    # used to raise IndexError (text[i + 1]) and a leading space was wrongly
+    # kept (text[i - 1] wrapping to the last character); both are now dropped.
+    assert replace_blank("hello ") == "hello"
+    assert replace_blank(" hello") == "hello"
+    assert replace_blank("a b ") == "a b"
+
+
+def test_replace_blank_drops_space_adjacent_to_non_ascii():
+    assert replace_blank("中 文") == "中文"