From 7506ab9bfeaa08a36c1c01a2b9810abab85de5bd Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Fri, 30 Jan 2026 21:56:25 +0100
Subject: [PATCH] Add benchmarks for modules to be replaced by Rust code

---
 benchmarks/sequence/align/benchmark_kmer.py   |  50 ++++++++
 .../sequence/align/benchmark_multiple.py      |  24 ++++
 .../sequence/align/benchmark_pairwise.py      |  48 ++++++++
 .../sequence/align/benchmark_selector.py      |  36 ++++++
 benchmarks/sequence/benchmark_alphabet.py     |  35 ++++++
 benchmarks/sequence/benchmark_phylo.py        |  27 +++++
 benchmarks/structure/benchmark_alphabet.py    |   2 +-
 benchmarks/structure/benchmark_bonds.py       | 107 ++++++++++++++++++
 benchmarks/structure/benchmark_celllist.py    |  21 +++-
 benchmarks/structure/benchmark_charges.py     |  16 +++
 benchmarks/structure/benchmark_compare.py     |   2 +-
 benchmarks/structure/benchmark_pdb.py         |  69 +++++++++++
 benchmarks/structure/benchmark_sasa.py        |  22 ++++
 benchmarks/structure/benchmark_superimpose.py |   2 +-
 src/biotite/structure/bonds.pyx               |   2 +-
 15 files changed, 455 insertions(+), 8 deletions(-)
 create mode 100644 benchmarks/sequence/align/benchmark_kmer.py
 create mode 100644 benchmarks/sequence/align/benchmark_multiple.py
 create mode 100644 benchmarks/sequence/align/benchmark_pairwise.py
 create mode 100644 benchmarks/sequence/align/benchmark_selector.py
 create mode 100644 benchmarks/sequence/benchmark_alphabet.py
 create mode 100644 benchmarks/sequence/benchmark_phylo.py
 create mode 100644 benchmarks/structure/benchmark_bonds.py
 create mode 100644 benchmarks/structure/benchmark_charges.py
 create mode 100644 benchmarks/structure/benchmark_pdb.py
 create mode 100644 benchmarks/structure/benchmark_sasa.py

diff --git a/benchmarks/sequence/align/benchmark_kmer.py b/benchmarks/sequence/align/benchmark_kmer.py
new file mode 100644
index 000000000..f2efaa721
--- /dev/null
+++ b/benchmarks/sequence/align/benchmark_kmer.py
@@ -0,0 +1,50 @@
+import numpy as np
+import pytest
+import biotite.sequence as seq
+import biotite.sequence.align as align
+
+SEQ_LENGTH = 10_000
+K = 3
+
+
+@pytest.fixture(scope="module")
+def sequence():
+    # Seed the global RNG, so that each run benchmarks the same sequence
+    np.random.seed(0)
+    n_symbols = len(seq.ProteinSequence.alphabet)
+    random_seq = seq.ProteinSequence()
+    random_seq.code = np.random.randint(n_symbols, size=SEQ_LENGTH)
+    return random_seq
+
+
+@pytest.fixture(scope="module")
+def kmer_alphabet():
+    return align.KmerAlphabet(seq.ProteinSequence.alphabet, K)
+
+
+@pytest.fixture(scope="module")
+def matrix():
+    return align.SubstitutionMatrix.std_protein_matrix()
+
+
+@pytest.fixture(scope="module")
+def score_threshold_rule(matrix):
+    return align.ScoreThresholdRule(matrix, 10)
+
+
+@pytest.mark.benchmark
+def benchmark_create_kmers(kmer_alphabet, sequence):
+    """
+    Convert the symbol code of the random protein sequence into k-mer codes.
+    """
+    kmer_alphabet.create_kmers(sequence.code)
+
+
+@pytest.mark.benchmark
+def benchmark_similar_kmers(score_threshold_rule, kmer_alphabet):
+    """
+    Enumerate the k-mers that the score threshold rule deems similar to one k-mer.
+    """
+    # An arbitrary but fixed k-mer code is used as the reference
+    reference_kmer = 0
+    score_threshold_rule.similar_kmers(kmer_alphabet, reference_kmer)
diff --git a/benchmarks/sequence/align/benchmark_multiple.py b/benchmarks/sequence/align/benchmark_multiple.py
new file mode 100644
index 000000000..b905a57c0
--- /dev/null
+++ b/benchmarks/sequence/align/benchmark_multiple.py
@@ -0,0 +1,24 @@
+from pathlib import Path
+import pytest
+import biotite.sequence.align as align
+import biotite.sequence.io.fasta as fasta
+from tests.util import data_dir
+
+
+@pytest.fixture(scope="module")
+def sequences():
+    cas9_path = Path(data_dir("sequence")) / "cas9.fasta"
+    return list(fasta.get_sequences(fasta.FastaFile.read(cas9_path)).values())
+
+
+@pytest.fixture(scope="module")
+def matrix():
+    return align.SubstitutionMatrix.std_protein_matrix()
+
+
+@pytest.mark.benchmark
+def benchmark_align_multiple(sequences, matrix):
+    """
+    Progressively align all sequences of the Cas9 test file with each other.
+    """
+    align.align_multiple(sequences, matrix, gap_penalty=(-10, -1))
diff --git a/benchmarks/sequence/align/benchmark_pairwise.py b/benchmarks/sequence/align/benchmark_pairwise.py
new file mode 100644
index 000000000..0d3d29ea5
--- /dev/null
+++ b/benchmarks/sequence/align/benchmark_pairwise.py
@@ -0,0 +1,48 @@
+from functools import partial
+from pathlib import Path
+import pytest
+import biotite.sequence as seq
+import biotite.sequence.align as align
+import biotite.sequence.io.fasta as fasta
+from tests.util import data_dir
+
+GAP_PENALTY = (-10, -1)
+
+
+@pytest.fixture(scope="module")
+def sequences():
+    cas9_path = Path(data_dir("sequence")) / "cas9.fasta"
+    return list(map(seq.ProteinSequence, fasta.FastaFile.read(cas9_path).values()))
+
+
+@pytest.fixture(scope="module")
+def matrix():
+    return align.SubstitutionMatrix.std_protein_matrix()
+
+
+@pytest.fixture(scope="module")
+def seq_pair(sequences):
+    return sequences[0], sequences[1]
+
+
+@pytest.fixture(scope="module")
+def seed(seq_pair):
+    return (len(seq_pair[0]) // 2, len(seq_pair[1]) // 2)
+
+
+@pytest.mark.benchmark
+@pytest.mark.parametrize(
+    "method",
+    [
+        partial(align.align_optimal, gap_penalty=GAP_PENALTY),
+        partial(align.align_banded, band=(-50, 50), gap_penalty=GAP_PENALTY),
+        partial(align.align_local_gapped, threshold=100, gap_penalty=GAP_PENALTY),
+    ],
+    ids=lambda align_fn: align_fn.func.__name__,
+)
+def benchmark_align_pairwise(seq_pair, matrix, seed, method):
+    """
+    Align the first two Cas9 sequences with the parametrized alignment function.
+    """
+    extra = {"seed": seed} if method.func is align.align_local_gapped else {}
+    method(*seq_pair, matrix, **extra)
diff --git a/benchmarks/sequence/align/benchmark_selector.py b/benchmarks/sequence/align/benchmark_selector.py
new file mode 100644
index 000000000..19aa5c9d9
--- /dev/null
+++ b/benchmarks/sequence/align/benchmark_selector.py
@@ -0,0 +1,36 @@
+import numpy as np
+import pytest
+import biotite.sequence as seq
+import biotite.sequence.align as align
+
+SEQ_LENGTH = 10_000
+K = 8
+S = 4
+WINDOW = 10
+ALPHABET = seq.NucleotideSequence.alphabet_unamb
+KMER_ALPHABET = align.KmerAlphabet(ALPHABET, K)
+
+
+@pytest.fixture(scope="module")
+def sequence():
+    random_seq = seq.NucleotideSequence()
+    np.random.seed(0)
+    random_seq.code = np.random.randint(len(ALPHABET), size=SEQ_LENGTH)
+    return random_seq
+
+
+@pytest.mark.benchmark
+@pytest.mark.parametrize(
+    "selector",
+    [
+        align.MinimizerSelector(KMER_ALPHABET, window=WINDOW),
+        align.SyncmerSelector(ALPHABET, K, S),
+        align.CachedSyncmerSelector(ALPHABET, K, S),
+        align.MincodeSelector(KMER_ALPHABET, compression=4),
+    ],
+    ids=lambda x: x.__class__.__name__,
+)
+def benchmark_select(sequence, selector):
+    """Select k-mers from a sequence using different selection strategies."""
+    # Selectors are built in the parametrization, so this body only runs select()
+    selector.select(sequence)
diff --git a/benchmarks/sequence/benchmark_alphabet.py b/benchmarks/sequence/benchmark_alphabet.py
new file mode 100644
index 000000000..7e68a3db0
--- /dev/null
+++ b/benchmarks/sequence/benchmark_alphabet.py
@@ -0,0 +1,35 @@
+import pytest
+import biotite.sequence as seq
+
+SEQ_LENGTH = 10_000
+
+
+@pytest.fixture(scope="module")
+def alphabet():
+    return seq.ProteinSequence.alphabet
+
+
+@pytest.fixture(scope="module")
+def symbols():
+    return "ACDEFGHIKLMNPQRSTVWY" * (SEQ_LENGTH // 20)
+
+
+@pytest.fixture(scope="module")
+def code(alphabet, symbols):
+    return alphabet.encode_multiple(symbols)
+
+
+@pytest.mark.benchmark
+def benchmark_encode(alphabet, symbols):
+    """
+    Encode a string of amino acid symbols into a sequence code.
+    """
+    alphabet.encode_multiple(symbols)
+
+
+@pytest.mark.benchmark
+def benchmark_decode(alphabet, code):
+    """
+    Decode a sequence code back into amino acid symbols.
+    """
+    alphabet.decode_multiple(code)
diff --git a/benchmarks/sequence/benchmark_phylo.py b/benchmarks/sequence/benchmark_phylo.py
new file mode 100644
index 000000000..704ee98ef
--- /dev/null
+++ b/benchmarks/sequence/benchmark_phylo.py
@@ -0,0 +1,27 @@
+import numpy as np
+import pytest
+import biotite.sequence.phylo as phylo
+
+N = 20
+
+
+@pytest.fixture(scope="module")
+def distances():
+    np.random.seed(0)
+    noise = np.random.rand(N, N).astype(np.float32)
+    symmetric = (noise + noise.T) / 2
+    np.fill_diagonal(symmetric, 0)
+    return symmetric
+
+
+@pytest.mark.benchmark
+@pytest.mark.parametrize(
+    "method",
+    [phylo.upgma, phylo.neighbor_joining],
+    ids=lambda x: x.__name__,
+)
+def benchmark_clustering(distances, method):
+    """
+    Run the parametrized hierarchical clustering method on a random distance matrix.
+    """
+    method(distances)
diff --git a/benchmarks/structure/benchmark_alphabet.py b/benchmarks/structure/benchmark_alphabet.py
index 54a9080ad..4d351853e 100644
--- a/benchmarks/structure/benchmark_alphabet.py
+++ b/benchmarks/structure/benchmark_alphabet.py
@@ -7,7 +7,7 @@
 PDB_ID = "1aki"
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def atoms():
     pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / f"{PDB_ID}.bcif")
     return pdbx.get_structure(pdbx_file, model=1, include_bonds=True)
diff --git a/benchmarks/structure/benchmark_bonds.py b/benchmarks/structure/benchmark_bonds.py
new file mode 100644
index 000000000..0bf01fa46
--- /dev/null
+++ b/benchmarks/structure/benchmark_bonds.py
@@ -0,0 +1,107 @@
+from pathlib import Path
+import pytest
+import biotite.structure as struc
+import biotite.structure.info as info
+import biotite.structure.io.pdbx as pdbx
+from tests.util import data_dir
+
+PDB_ID = "1aki"
+
+
+@pytest.fixture(autouse=True, scope="session")
+def load_ccd():
+    """
+    Load the CCD up front, so that its loading time does not bias the benchmarks.
+    """
+    info.get_ccd()
+
+
+@pytest.fixture(scope="module")
+def atoms():
+    bcif = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / f"{PDB_ID}.bcif")
+    return pdbx.get_structure(bcif, include_bonds=True, model=1)
+
+
+@pytest.fixture(scope="module")
+def bond_array(atoms):
+    return atoms.bonds.as_array()
+
+
+@pytest.mark.benchmark
+def benchmark_bond_list_creation(atoms, bond_array):
+    """
+    Construct a `BondList` from a bond array, including sorting and deduplication.
+    """
+    struc.BondList(atoms.array_length(), bond_array)
+
+
+@pytest.mark.benchmark
+@pytest.mark.parametrize(
+    "method",
+    [
+        struc.BondList.as_set,
+        struc.BondList.as_graph,
+        struc.BondList.as_array,
+        struc.BondList.get_all_bonds,
+        struc.BondList.adjacency_matrix,
+        struc.BondList.bond_type_matrix,
+    ],
+    ids=lambda converter: converter.__name__,
+)
+def benchmark_conversion(atoms, method):
+    """
+    Turn the `BondList` of the structure into the parametrized representation.
+    """
+    method(atoms.bonds)
+
+
+@pytest.mark.benchmark
+def benchmark_get_bonds(atoms):
+    """
+    Query the bonds of every atom, one atom index at a time.
+    """
+    for atom_index in range(atoms.array_length()):
+        atoms.bonds.get_bonds(atom_index)
+
+
+@pytest.mark.benchmark
+def benchmark_get_all_bonds(atoms):
+    """
+    Get the bonds of all atoms at once with a single call.
+    """
+    atoms.bonds.get_all_bonds()
+
+
+@pytest.mark.benchmark
+def benchmark_concatenate(atoms):
+    """
+    Concatenate the `BondList` of the structure with itself.
+    """
+    atoms.bonds.concatenate([atoms.bonds, atoms.bonds])
+
+
+@pytest.mark.benchmark
+@pytest.mark.parametrize(
+    "connect_fn", [struc.connect_via_distances, struc.connect_via_residue_names]
+)
+def benchmark_connect(atoms, connect_fn):
+    """
+    Derive the bonds of the structure with the parametrized connection function.
+    """
+    connect_fn(atoms)
+
+
+@pytest.mark.benchmark
+def benchmark_find_connected(atoms):
+    """
+    Find all atoms that are connected to the atom at index 0.
+    """
+    struc.find_connected(atoms.bonds, 0)
+
+
+@pytest.mark.benchmark
+def benchmark_find_rotatable_bonds(atoms):
+    """
+    Identify the rotatable bonds in the `BondList` of the structure.
+    """
+    struc.find_rotatable_bonds(atoms.bonds)
diff --git a/benchmarks/structure/benchmark_celllist.py b/benchmarks/structure/benchmark_celllist.py
index afff79a07..1d41a4405 100644
--- a/benchmarks/structure/benchmark_celllist.py
+++ b/benchmarks/structure/benchmark_celllist.py
@@ -5,15 +5,28 @@
 from tests.util import data_dir
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def atoms():
     pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / "1gya.bcif")
     return pdbx.get_structure(pdbx_file, model=1)
 
 
-def benchmark_cell_list(atoms):
+@pytest.fixture(scope="module")
+def cell_list(atoms):
+    return struc.CellList(atoms, 5.0)
+
+
+@pytest.mark.benchmark
+def benchmark_cell_list_creation(atoms):
+    """
+    Construct a new `CellList` from the atoms of the structure.
+    """
+    struc.CellList(atoms, 5.0)
+
+
+@pytest.mark.benchmark
+def benchmark_cell_list_compute_contacts(cell_list, atoms):
     """
-    Find all contacts in a structure using a cell list.
+    Find all contacts in a structure using an existing cell list.
     """
-    cell_list = struc.CellList(atoms, 5.0)
     cell_list.get_atoms(atoms.coord, 5.0)
diff --git a/benchmarks/structure/benchmark_charges.py b/benchmarks/structure/benchmark_charges.py
new file mode 100644
index 000000000..4040a35ba
--- /dev/null
+++ b/benchmarks/structure/benchmark_charges.py
@@ -0,0 +1,16 @@
+import pytest
+import biotite.structure as struc
+import biotite.structure.info as info
+
+
+@pytest.fixture(scope="module")
+def atoms():
+    return info.residue("PNN")
+
+
+@pytest.mark.benchmark
+def benchmark_partial_charges(atoms):
+    """
+    Compute the partial charge of every atom in the `PNN` residue.
+    """
+    struc.partial_charges(atoms)
diff --git a/benchmarks/structure/benchmark_compare.py b/benchmarks/structure/benchmark_compare.py
index b6b00be83..67f9a5cbe 100644
--- a/benchmarks/structure/benchmark_compare.py
+++ b/benchmarks/structure/benchmark_compare.py
@@ -7,7 +7,7 @@
 from tests.util import data_dir
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def atoms():
     pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / "1gya.bcif")
     atoms = pdbx.get_structure(pdbx_file)
diff --git a/benchmarks/structure/benchmark_pdb.py b/benchmarks/structure/benchmark_pdb.py
new file mode 100644
index 000000000..fb8006b48
--- /dev/null
+++ b/benchmarks/structure/benchmark_pdb.py
@@ -0,0 +1,105 @@
+from pathlib import Path
+import pytest
+import biotite.structure.info as info
+import biotite.structure.io.pdb as pdb
+from tests.util import data_dir
+
+
+@pytest.fixture(autouse=True, scope="session")
+def load_ccd():
+    """
+    Ensure that the CCD is already loaded to avoid biasing tests with its loading time.
+    """
+    info.get_ccd()
+
+
+@pytest.fixture(scope="module")
+def pdb_file_path():
+    return Path(data_dir("structure")) / "1aki.pdb"
+
+
+@pytest.fixture(scope="module")
+def empty_pdb_file():
+    return pdb.PDBFile()
+
+
+@pytest.fixture(scope="module")
+def pdb_file(pdb_file_path):
+    return pdb.PDBFile.read(pdb_file_path)
+
+
+@pytest.fixture(scope="module")
+def atoms(pdb_file):
+    """
+    The first model of the structure including its bonds.
+    """
+    return pdb_file.get_structure(model=1, include_bonds=True)
+
+
+@pytest.fixture(scope="module")
+def atoms_without_bonds(atoms):
+    """
+    A copy of the structure with its bonds removed.
+
+    The shared `atoms` fixture must not be modified in place, as it is reused by
+    other benchmarks in this module.
+    """
+    bond_free_atoms = atoms.copy()
+    bond_free_atoms.bonds = None
+    return bond_free_atoms
+
+
+@pytest.mark.benchmark
+def benchmark_read(pdb_file_path):
+    """
+    Read a PDB file from disk.
+    """
+    pdb.PDBFile.read(pdb_file_path)
+
+
+@pytest.mark.benchmark
+def benchmark_get_coord(pdb_file):
+    """
+    Get the coordinates of the first model.
+    """
+    pdb_file.get_coord(model=1)
+
+
+@pytest.mark.benchmark
+def benchmark_get_structure(pdb_file):
+    """
+    Get the first model as structure without requesting bonds.
+    """
+    pdb_file.get_structure(model=1)
+
+
+@pytest.mark.benchmark
+def benchmark_get_structure_with_bonds(pdb_file):
+    """
+    Get the first model as structure including bonds.
+    """
+    pdb_file.get_structure(model=1, include_bonds=True)
+
+
+@pytest.mark.benchmark
+def benchmark_get_remark(pdb_file):
+    """
+    Get the content of ``REMARK 350``.
+    """
+    pdb_file.get_remark(350)
+
+
+@pytest.mark.benchmark
+def benchmark_set_structure(atoms_without_bonds, empty_pdb_file):
+    """
+    Write a structure without bonds into a `PDBFile`.
+    """
+    empty_pdb_file.set_structure(atoms_without_bonds)
+
+
+@pytest.mark.benchmark
+def benchmark_set_structure_with_bonds(atoms, empty_pdb_file):
+    """
+    Write a structure including its bonds into a `PDBFile`.
+    """
+    empty_pdb_file.set_structure(atoms)
diff --git a/benchmarks/structure/benchmark_sasa.py b/benchmarks/structure/benchmark_sasa.py
new file mode 100644
index 000000000..1bd2e987f
--- /dev/null
+++ b/benchmarks/structure/benchmark_sasa.py
@@ -0,0 +1,22 @@
+from pathlib import Path
+import pytest
+import biotite.structure as struc
+import biotite.structure.io.pdbx as pdbx
+from tests.util import data_dir
+
+
+@pytest.fixture(scope="module")
+def atoms():
+    """
+    The first model of `1gya`, a structure that includes hydrogen atoms.
+    """
+    bcif = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / "1gya.bcif")
+    return pdbx.get_structure(bcif, include_bonds=True, model=1)
+
+
+@pytest.mark.benchmark
+def benchmark_sasa(atoms):
+    """
+    Compute the SASA of each atom using the "Single" VdW radii set.
+    """
+    struc.sasa(atoms, vdw_radii="Single")
diff --git a/benchmarks/structure/benchmark_superimpose.py b/benchmarks/structure/benchmark_superimpose.py
index a4376ed7e..f030c1430 100644
--- a/benchmarks/structure/benchmark_superimpose.py
+++ b/benchmarks/structure/benchmark_superimpose.py
@@ -5,7 +5,7 @@
 from tests.util import data_dir
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def atoms():
     pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / "1gya.bcif")
     return pdbx.get_structure(pdbx_file)
diff --git a/src/biotite/structure/bonds.pyx b/src/biotite/structure/bonds.pyx
index 480504e34..9d0673e92 100644
--- a/src/biotite/structure/bonds.pyx
+++ b/src/biotite/structure/bonds.pyx
@@ -1875,7 +1875,7 @@ def find_connected(bond_list, uint32 root, bint as_mask=False):
     """
     find_connected(bond_list, root, as_mask=False)
 
-    Get indices to all atoms that are directly or inderectly connected
+    Get indices to all atoms that are directly or indirectly connected
     to the root atom indicated by the given index.
 
     An atom is *connected* to the `root` atom, if that atom is reachable