From 7506ab9bfeaa08a36c1c01a2b9810abab85de5bd Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Fri, 30 Jan 2026 21:56:25 +0100 Subject: [PATCH] Add benchmarks for modules to be replaced by Rust code --- benchmarks/sequence/align/benchmark_kmer.py | 50 ++++++++ .../sequence/align/benchmark_multiple.py | 24 ++++ .../sequence/align/benchmark_pairwise.py | 48 ++++++++ .../sequence/align/benchmark_selector.py | 36 ++++++ benchmarks/sequence/benchmark_alphabet.py | 35 ++++++ benchmarks/sequence/benchmark_phylo.py | 27 +++++ benchmarks/structure/benchmark_alphabet.py | 2 +- benchmarks/structure/benchmark_bonds.py | 107 ++++++++++++++++++ benchmarks/structure/benchmark_celllist.py | 21 +++- benchmarks/structure/benchmark_charges.py | 16 +++ benchmarks/structure/benchmark_compare.py | 2 +- benchmarks/structure/benchmark_pdb.py | 69 +++++++++++ benchmarks/structure/benchmark_sasa.py | 22 ++++ benchmarks/structure/benchmark_superimpose.py | 2 +- src/biotite/structure/bonds.pyx | 2 +- 15 files changed, 455 insertions(+), 8 deletions(-) create mode 100644 benchmarks/sequence/align/benchmark_kmer.py create mode 100644 benchmarks/sequence/align/benchmark_multiple.py create mode 100644 benchmarks/sequence/align/benchmark_pairwise.py create mode 100644 benchmarks/sequence/align/benchmark_selector.py create mode 100644 benchmarks/sequence/benchmark_alphabet.py create mode 100644 benchmarks/sequence/benchmark_phylo.py create mode 100644 benchmarks/structure/benchmark_bonds.py create mode 100644 benchmarks/structure/benchmark_charges.py create mode 100644 benchmarks/structure/benchmark_pdb.py create mode 100644 benchmarks/structure/benchmark_sasa.py diff --git a/benchmarks/sequence/align/benchmark_kmer.py b/benchmarks/sequence/align/benchmark_kmer.py new file mode 100644 index 000000000..f2efaa721 --- /dev/null +++ b/benchmarks/sequence/align/benchmark_kmer.py @@ -0,0 +1,50 @@ +import numpy as np +import pytest +import biotite.sequence as seq +import biotite.sequence.align as align + +SEQ_LENGTH = 10_000 +K = 3 + + +@pytest.fixture(scope="module") +def sequence(): + np.random.seed(0) + sequence = seq.ProteinSequence() + sequence.code = np.random.randint( + len(seq.ProteinSequence.alphabet), size=SEQ_LENGTH + ) + return sequence + + +@pytest.fixture(scope="module") +def kmer_alphabet(): + return align.KmerAlphabet(seq.ProteinSequence.alphabet, K) + + +@pytest.fixture(scope="module") +def matrix(): + return align.SubstitutionMatrix.std_protein_matrix() + + +@pytest.fixture(scope="module") +def score_threshold_rule(matrix): + return align.ScoreThresholdRule(matrix, 10) + + +@pytest.mark.benchmark +def benchmark_create_kmers(kmer_alphabet, sequence): + """ + Create k-mer codes from a sequence. + """ + kmer_alphabet.create_kmers(sequence.code) + + +@pytest.mark.benchmark +def benchmark_similar_kmers(score_threshold_rule, kmer_alphabet): + """ + Find all k-mers similar to a reference k-mer using a score threshold. + """ + KMER_CODE = 0 + + score_threshold_rule.similar_kmers(kmer_alphabet, KMER_CODE) diff --git a/benchmarks/sequence/align/benchmark_multiple.py b/benchmarks/sequence/align/benchmark_multiple.py new file mode 100644 index 000000000..b905a57c0 --- /dev/null +++ b/benchmarks/sequence/align/benchmark_multiple.py @@ -0,0 +1,24 @@ +from pathlib import Path +import pytest +import biotite.sequence.align as align +import biotite.sequence.io.fasta as fasta +from tests.util import data_dir + + +@pytest.fixture(scope="module") +def sequences(): + fasta_file = fasta.FastaFile.read(Path(data_dir("sequence")) / "cas9.fasta") + return list(fasta.get_sequences(fasta_file).values()) + + +@pytest.fixture(scope="module") +def matrix(): + return align.SubstitutionMatrix.std_protein_matrix() + + +@pytest.mark.benchmark +def benchmark_align_multiple(sequences, matrix): + """ + Perform progressive multiple sequence alignment. + """ + align.align_multiple(sequences, matrix, gap_penalty=(-10, -1)) diff --git a/benchmarks/sequence/align/benchmark_pairwise.py b/benchmarks/sequence/align/benchmark_pairwise.py new file mode 100644 index 000000000..0d3d29ea5 --- /dev/null +++ b/benchmarks/sequence/align/benchmark_pairwise.py @@ -0,0 +1,48 @@ +from functools import partial +from pathlib import Path +import pytest +import biotite.sequence as seq +import biotite.sequence.align as align +import biotite.sequence.io.fasta as fasta +from tests.util import data_dir + +GAP_PENALTY = (-10, -1) + + +@pytest.fixture(scope="module") +def sequences(): + fasta_file = fasta.FastaFile.read(Path(data_dir("sequence")) / "cas9.fasta") + return [seq.ProteinSequence(s) for s in fasta_file.values()] + + +@pytest.fixture(scope="module") +def matrix(): + return align.SubstitutionMatrix.std_protein_matrix() + + +@pytest.fixture(scope="module") +def seq_pair(sequences): + return sequences[0], sequences[1] + + +@pytest.fixture(scope="module") +def seed(seq_pair): + return (len(seq_pair[0]) // 2, len(seq_pair[1]) // 2) + + +@pytest.mark.benchmark +@pytest.mark.parametrize( + "method", + [ + partial(align.align_optimal, gap_penalty=GAP_PENALTY), + partial(align.align_banded, band=(-50, 50), gap_penalty=GAP_PENALTY), + partial(align.align_local_gapped, threshold=100, gap_penalty=GAP_PENALTY), + ], + ids=lambda x: x.func.__name__, +) +def benchmark_align_pairwise(seq_pair, matrix, seed, method): + """ + Perform pairwise sequence alignment using different algorithms. + """ + kwargs = {"seed": seed} if method.func is align.align_local_gapped else {} + method(seq_pair[0], seq_pair[1], matrix, **kwargs) diff --git a/benchmarks/sequence/align/benchmark_selector.py b/benchmarks/sequence/align/benchmark_selector.py new file mode 100644 index 000000000..19aa5c9d9 --- /dev/null +++ b/benchmarks/sequence/align/benchmark_selector.py @@ -0,0 +1,36 @@ +import numpy as np +import pytest +import biotite.sequence as seq +import biotite.sequence.align as align + +SEQ_LENGTH = 10_000 +K = 8 +S = 4 +WINDOW = 10 +ALPHABET = seq.NucleotideSequence.alphabet_unamb +KMER_ALPHABET = align.KmerAlphabet(ALPHABET, K) + + +@pytest.fixture(scope="module") +def sequence(): + np.random.seed(0) + s = seq.NucleotideSequence() + s.code = np.random.randint(len(ALPHABET), size=SEQ_LENGTH) + return s + + +@pytest.mark.parametrize( + "selector", + [ + align.MinimizerSelector(KMER_ALPHABET, window=WINDOW), + align.SyncmerSelector(ALPHABET, K, S), + align.CachedSyncmerSelector(ALPHABET, K, S), + align.MincodeSelector(KMER_ALPHABET, compression=4), + ], + ids=lambda x: x.__class__.__name__, +) +def benchmark_select(sequence, selector): + """ + Select k-mers from a sequence using different selection strategies. + """ + selector.select(sequence) diff --git a/benchmarks/sequence/benchmark_alphabet.py b/benchmarks/sequence/benchmark_alphabet.py new file mode 100644 index 000000000..7e68a3db0 --- /dev/null +++ b/benchmarks/sequence/benchmark_alphabet.py @@ -0,0 +1,35 @@ +import pytest +import biotite.sequence as seq + +SEQ_LENGTH = 10_000 + + +@pytest.fixture(scope="module") +def alphabet(): + return seq.ProteinSequence.alphabet + + +@pytest.fixture(scope="module") +def symbols(): + return "ACDEFGHIKLMNPQRSTVWY" * (SEQ_LENGTH // 20) + + +@pytest.fixture(scope="module") +def code(alphabet, symbols): + return alphabet.encode_multiple(symbols) + + +@pytest.mark.benchmark +def benchmark_encode(alphabet, symbols): + """ + Encode symbols into a sequence code. + """ + alphabet.encode_multiple(symbols) + + +@pytest.mark.benchmark +def benchmark_decode(alphabet, code): + """ + Decode a sequence code into symbols. + """ + alphabet.decode_multiple(code) diff --git a/benchmarks/sequence/benchmark_phylo.py b/benchmarks/sequence/benchmark_phylo.py new file mode 100644 index 000000000..704ee98ef --- /dev/null +++ b/benchmarks/sequence/benchmark_phylo.py @@ -0,0 +1,27 @@ +import numpy as np +import pytest +import biotite.sequence.phylo as phylo + +N = 20 + + +@pytest.fixture(scope="module") +def distances(): + np.random.seed(0) + rand = np.random.rand(N, N).astype(np.float32) + distances = (rand + rand.T) / 2 + np.fill_diagonal(distances, 0) + return distances + + +@pytest.mark.benchmark +@pytest.mark.parametrize( + "method", + [phylo.upgma, phylo.neighbor_joining], + ids=lambda x: x.__name__, +) +def benchmark_clustering(distances, method): + """ + Perform hierarchical clustering from a distance matrix. + """ + method(distances) diff --git a/benchmarks/structure/benchmark_alphabet.py b/benchmarks/structure/benchmark_alphabet.py index 54a9080ad..4d351853e 100644 --- a/benchmarks/structure/benchmark_alphabet.py +++ b/benchmarks/structure/benchmark_alphabet.py @@ -7,7 +7,7 @@ PDB_ID = "1aki" -@pytest.fixture +@pytest.fixture(scope="module") def atoms(): pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / f"{PDB_ID}.bcif") return pdbx.get_structure(pdbx_file, model=1, include_bonds=True) diff --git a/benchmarks/structure/benchmark_bonds.py b/benchmarks/structure/benchmark_bonds.py new file mode 100644 index 000000000..0bf01fa46 --- /dev/null +++ b/benchmarks/structure/benchmark_bonds.py @@ -0,0 +1,107 @@ +from pathlib import Path +import pytest +import biotite.structure as struc +import biotite.structure.info as info +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir + +PDB_ID = "1aki" + + +@pytest.fixture(autouse=True, scope="session") +def load_ccd(): + """ + Ensure that the CCD is already loaded to avoid biasing tests with its loading time. + """ + info.get_ccd() + + +@pytest.fixture(scope="module") +def atoms(): + pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / f"{PDB_ID}.bcif") + return pdbx.get_structure(pdbx_file, model=1, include_bonds=True) + + +@pytest.fixture(scope="module") +def bond_array(atoms): + return atoms.bonds.as_array() + + +@pytest.mark.benchmark +def benchmark_bond_list_creation(atoms, bond_array): + """ + Create a `BondList` from an array of bonds, which involves sorting and deduplication. + """ + struc.BondList(atoms.array_length(), bond_array) + + +@pytest.mark.benchmark +@pytest.mark.parametrize( + "method", + [ + struc.BondList.as_set, + struc.BondList.as_graph, + struc.BondList.as_array, + struc.BondList.get_all_bonds, + struc.BondList.adjacency_matrix, + struc.BondList.bond_type_matrix, + ], + ids=lambda x: x.__name__, +) +def benchmark_conversion(atoms, method): + """ + Convert the `BondList` to a different representation. + """ + method(atoms.bonds) + + +@pytest.mark.benchmark +def benchmark_get_bonds(atoms): + """ + Get the bonds for each atom index. + """ + for i in range(atoms.array_length()): + atoms.bonds.get_bonds(i) + + +@pytest.mark.benchmark +def benchmark_get_all_bonds(atoms): + """ + Get the bonds for all atom indices. + """ + atoms.bonds.get_all_bonds() + + +@pytest.mark.benchmark +def benchmark_concatenate(atoms): + """ + Concatenate two `BondList` objects. + """ + atoms.bonds.concatenate([atoms.bonds, atoms.bonds]) + + +@pytest.mark.parametrize( + "connect_fn", [struc.connect_via_distances, struc.connect_via_residue_names] +) +@pytest.mark.benchmark +def benchmark_connect(atoms, connect_fn): + """ + Find bonds between atoms using the specified method. + """ + connect_fn(atoms) + + +@pytest.mark.benchmark +def benchmark_find_connected(atoms): + """ + Find all connected atoms for a given atom index. + """ + struc.find_connected(atoms.bonds, 0) + + +@pytest.mark.benchmark +def benchmark_find_rotatable_bonds(atoms): + """ + Find all rotatable bonds in a `BondList`. + """ + struc.find_rotatable_bonds(atoms.bonds) diff --git a/benchmarks/structure/benchmark_celllist.py b/benchmarks/structure/benchmark_celllist.py index afff79a07..1d41a4405 100644 --- a/benchmarks/structure/benchmark_celllist.py +++ b/benchmarks/structure/benchmark_celllist.py @@ -5,15 +5,28 @@ from tests.util import data_dir -@pytest.fixture +@pytest.fixture(scope="module") def atoms(): pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / "1gya.bcif") return pdbx.get_structure(pdbx_file, model=1) -def benchmark_cell_list(atoms): +@pytest.fixture(scope="module") +def cell_list(atoms): + return struc.CellList(atoms, 5.0) + + +@pytest.mark.benchmark +def benchmark_cell_list_creation(atoms): + """ + Create a cell list for a structure. + """ + struc.CellList(atoms, 5.0) + + +@pytest.mark.benchmark +def benchmark_cell_list_compute_contacts(cell_list, atoms): """ - Find all contacts in a structure using a cell list. + Find all contacts in a structure using an existing cell list. """ - cell_list = struc.CellList(atoms, 5.0) cell_list.get_atoms(atoms.coord, 5.0) diff --git a/benchmarks/structure/benchmark_charges.py b/benchmarks/structure/benchmark_charges.py new file mode 100644 index 000000000..4040a35ba --- /dev/null +++ b/benchmarks/structure/benchmark_charges.py @@ -0,0 +1,16 @@ +import pytest +import biotite.structure as struc +import biotite.structure.info as info + + +@pytest.fixture(scope="module") +def atoms(): + return info.residue("PNN") + + +@pytest.mark.benchmark +def benchmark_partial_charges(atoms): + """ + Compute the partial charges of each atom in a structure. + """ + struc.partial_charges(atoms) diff --git a/benchmarks/structure/benchmark_compare.py b/benchmarks/structure/benchmark_compare.py index b6b00be83..67f9a5cbe 100644 --- a/benchmarks/structure/benchmark_compare.py +++ b/benchmarks/structure/benchmark_compare.py @@ -7,7 +7,7 @@ from tests.util import data_dir -@pytest.fixture +@pytest.fixture(scope="module") def atoms(): pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / "1gya.bcif") atoms = pdbx.get_structure(pdbx_file) diff --git a/benchmarks/structure/benchmark_pdb.py b/benchmarks/structure/benchmark_pdb.py new file mode 100644 index 000000000..fb8006b48 --- /dev/null +++ b/benchmarks/structure/benchmark_pdb.py @@ -0,0 +1,69 @@ +from pathlib import Path +import pytest +import biotite.structure.info as info +import biotite.structure.io.pdb as pdb +from tests.util import data_dir + + +@pytest.fixture(autouse=True, scope="session") +def load_ccd(): + """ + Ensure that the CCD is already loaded to avoid biasing tests with its loading time. + """ + info.get_ccd() + + +@pytest.fixture(scope="module") +def pdb_file_path(): + return Path(data_dir("structure")) / "1aki.pdb" + + +@pytest.fixture(scope="module") +def empty_pdb_file(): + return pdb.PDBFile() + + +@pytest.fixture(scope="module") +def pdb_file(pdb_file_path): + return pdb.PDBFile.read(pdb_file_path) + + +@pytest.fixture(scope="module") +def atoms(pdb_file): + return pdb_file.get_structure(model=1, include_bonds=True) + + +@pytest.mark.benchmark +def benchmark_read(pdb_file_path): + pdb.PDBFile.read(pdb_file_path) + + +@pytest.mark.benchmark +def benchmark_get_coord(pdb_file): + pdb_file.get_coord(model=1) + + +@pytest.mark.benchmark +def benchmark_get_structure(pdb_file): + pdb_file.get_structure(model=1) + + +@pytest.mark.benchmark +def benchmark_get_structure_with_bonds(pdb_file): + pdb_file.get_structure(model=1, include_bonds=True) + + +@pytest.mark.benchmark +def benchmark_get_remark(pdb_file): + pdb_file.get_remark(350) + + +@pytest.mark.benchmark +def benchmark_set_structure(atoms, empty_pdb_file): + atoms.bonds = None + empty_pdb_file.set_structure(atoms) + + +@pytest.mark.benchmark +def benchmark_set_structure_with_bonds(atoms, empty_pdb_file): + empty_pdb_file.set_structure(atoms) diff --git a/benchmarks/structure/benchmark_sasa.py b/benchmarks/structure/benchmark_sasa.py new file mode 100644 index 000000000..1bd2e987f --- /dev/null +++ b/benchmarks/structure/benchmark_sasa.py @@ -0,0 +1,22 @@ +from pathlib import Path +import pytest +import biotite.structure as struc +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir + + +@pytest.fixture(scope="module") +def atoms(): + """ + A structure that includes hydrogen atoms. + """ + pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / "1gya.bcif") + return pdbx.get_structure(pdbx_file, model=1, include_bonds=True) + + +@pytest.mark.benchmark +def benchmark_sasa(atoms): + """ + Compute the SASA of each atom in a structure. + """ + struc.sasa(atoms, vdw_radii="Single") diff --git a/benchmarks/structure/benchmark_superimpose.py b/benchmarks/structure/benchmark_superimpose.py index a4376ed7e..f030c1430 100644 --- a/benchmarks/structure/benchmark_superimpose.py +++ b/benchmarks/structure/benchmark_superimpose.py @@ -5,7 +5,7 @@ from tests.util import data_dir -@pytest.fixture +@pytest.fixture(scope="module") def atoms(): pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / "1gya.bcif") return pdbx.get_structure(pdbx_file) diff --git a/src/biotite/structure/bonds.pyx b/src/biotite/structure/bonds.pyx index 480504e34..9d0673e92 100644 --- a/src/biotite/structure/bonds.pyx +++ b/src/biotite/structure/bonds.pyx @@ -1875,7 +1875,7 @@ def find_connected(bond_list, uint32 root, bint as_mask=False): """ find_connected(bond_list, root, as_mask=False) - Get indices to all atoms that are directly or inderectly connected + Get indices to all atoms that are directly or indirectly connected to the root atom indicated by the given index. An atom is *connected* to the `root` atom, if that atom is reachable