flagos-ai · factnn · Jun 17, 2026 · Jun 17, 2026
diff --git a/docs/source/development/contributing.md b/docs/source/development/contributing.md
@@ -30,7 +30,7 @@ pip install -r requirements/requirements_nvidia.txt
 pip install -e .
 
 # Run tests
-python -m pytest tests/
+python scripts/generate_kernel_and_verify.py --single-test --server-type openai
 ```
 
 ## Code Style

diff --git a/docs/source/development/custom-operators.md b/docs/source/development/custom-operators.md
@@ -4,91 +4,102 @@ How to add custom operators to KernelGenBench.
 
 ## Overview
 
-To benchmark your own operators:
-1. Add test cases to `src/kernelgenbench/accuracy/`
-2. Register them in the dataset
-3. Create operator schema
+To benchmark your own operators, follow the existing pattern (see vLLM/cuBLAS/SGLang as examples):
 
-## Step 1: Create Accuracy Test
+1. Add a thin baseline wrapper in `src/kernelgenbench/dataset/baseline/<namespace>/`
+2. Register operator names in `src/kernelgenbench/dataset/kernel_list.py`
+3. Create accuracy tests in `src/kernelgenbench/accuracy/<namespace>/`
+4. (Optional) Add implementation info for prompt generation
 
-Create a file in `src/kernelgenbench/accuracy/`:
+## Step 1: Create Baseline Wrapper
+
+Create a thin Python wrapper that calls your reference implementation.
+Follow the vLLM pattern for simple function calls:
 
 ```python
-# src/kernelgenbench/accuracy/custom_ops.py
+# src/kernelgenbench/dataset/baseline/myns/my_op.py
 
 import torch
-from .base import AccuracyTest
 
-class CustomAddTest(AccuracyTest):
-    """Test for custom_add operator."""
+try:
+    from my_library import my_op_impl
+except ModuleNotFoundError:
+    my_op_impl = None
 
-    op_name = "custom::add"
 
-    def get_test_cases(self):
-        """Generate test cases for different shapes and dtypes."""
-        return [
-            # (shape, dtype, kwargs)
-            ((64, 64), torch.float32, {}),
-            ((128, 128), torch.float16, {}),
-            ((256, 256), torch.float32, {}),
-        ]
+def my_op(
+    x: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    """Wrapper for my_op implementation."""
+    return my_op_impl(x, y)
+```
 
-    def baseline(self, x, y):
-        """Reference implementation."""
-        return x + y
+Create `src/kernelgenbench/dataset/baseline/myns/__init__.py` to export all operators:
 
-    def validate(self, output, expected, rtol=1e-5, atol=1e-5):
-        """Validate output against expected."""
-        return torch.allclose(output, expected, rtol=rtol, atol=atol)
+```python
+from .my_op import my_op
 ```
 
-## Step 2: Register Operator
+## Step 2: Register in Dataset
 
-Add to `src/kernelgenbench/dataset/registry.py`:
+Add to `src/kernelgenbench/dataset/kernel_list.py`:
 
 ```python
-OPERATOR_REGISTRY = {
-    # ... existing operators ...
-    "custom::add": {
-        "test_class": "CustomAddTest",
-        "module": "custom_ops",
-        "description": "Custom addition operator",
-    },
-}
+MYNS_OPERATOR_NAMES = ['my_op']
+
+def _load_myns_operators():
+    from .baseline import myns
+    return {f'myns::{name}': getattr(myns, name) for name in MYNS_OPERATOR_NAMES}
+
+def get_myns_operators():
+    return _load_myns_operators()
 ```
 
-## Step 3: Create Schema (Optional)
+## Step 3: Create Accuracy Tests
 
-For LLM prompt generation, create a schema:
+Create accuracy tests following the vLLM/SGLang test pattern:
 
 ```python
-# src/kernelgenbench/schemas/custom.py
-
-CUSTOM_ADD_SCHEMA = {
-    "name": "custom::add",
-    "inputs": [
-        {"name": "x", "type": "Tensor", "description": "First input tensor"},
-        {"name": "y", "type": "Tensor", "description": "Second input tensor"},
-    ],
-    "outputs": [
-        {"name": "out", "type": "Tensor", "description": "Output tensor"},
-    ],
-    "description": "Element-wise addition of two tensors",
-}
+# src/kernelgenbench/accuracy/myns/test_my_op.py
+
+import kernelgenbench
+from sandbox.config import DEVICE as device
+from sandbox.verifier.test_parametrize import parametrize, label
+from sandbox.utils.accuracy_utils import kernelgenbench_assert_close as assert_close
+import torch
+import triton
+
+
+@label("my_op")
+@parametrize("shape", [(64, 64), (128, 128), (512, 512)])
+@parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
+def test_accuracy_my_op(shape, dtype):
+    M, N = shape
+    x = torch.randn(M, N, device=device, dtype=dtype)
+    y = torch.randn(M, N, device=device, dtype=dtype)
+
+    ref_out = kernelgenbench.baseline.my_op(x, y)
+    # During verification, kernelgenbench.triton.my_op is the generated kernel
+    act_out = kernelgenbench.triton.my_op(x.clone(), y.clone())
+
+    assert_close(act_out, ref_out, dtype)
 ```
 
+## Step 4: Add Implementation Info (Optional)
+
+For LLM-based kernel generation, add an operator specification to the dataset's
+implementation info (see `vllm_IMPL_INFO.json` and `cublas_IMPL_INFO.json` for examples).
+
 ## Test Your Operator
 
 ```bash
+# Test with a single operator
 python scripts/generate_kernel_and_verify.py \
-    --op-name custom::add \
+    --op-name myns::my_op \
     --single-test \
     --server-type openai
-```
 
-## Best Practices
-
-1. **Comprehensive test cases** - Cover edge cases
-2. **Multiple dtypes** - Test float16, float32, etc.
-3. **Various shapes** - Test small and large tensors
-4. **Clear descriptions** - Help LLMs understand the operator
+# Verify accuracy
+# (runs the accuracy test you created in Step 3)
+```
diff --git a/docs/source/development/extending.md b/docs/source/development/extending.md
@@ -6,10 +6,10 @@ How to extend KernelGenBench with new platforms and methods.
 
 ### Step 1: Add Device Detection
 
-Edit `src/kernelgenbench/runtime/device.py`:
+Edit `src/runtime/__init__.py`:
 
 ```python
-def detect_device():
+def _detect_device_name() -> str:
     """Detect current hardware platform."""
     # Add detection logic for your platform
     if is_my_platform():
@@ -21,7 +21,7 @@ def detect_device():
 
 ### Step 2: Create Platform Configuration
 
-Create `src/kernelgenbench/runtime/platforms/my_platform.py`:
+Add platform constraints to the `DEVICE_CONSTRAINTS` dict in `src/runtime/__init__.py`:
 
 ```python
 class MyPlatformConfig:
@@ -134,7 +134,7 @@ python run.py --method my_method "$@"
 
 ## Adding New Evaluation Metrics
 
-Edit `src/kernelgenbench/metrics/`:
+Add custom analysis to `scripts/analyze/analyze.py`:
 
 ```python
 def compute_my_metric(results):

diff --git a/docs/source/faq/index.md b/docs/source/faq/index.md
@@ -6,7 +6,7 @@ This section answers common questions about KernelGenBench.
 
 ### Q: Which Python version do I need?
 
-**A**: Python 3.8 or higher is required. Python 3.10+ is recommended for best compatibility.
+**A**: Python 3.10 or higher is required.
 
 ### Q: Can I use KernelGenBench on CPU-only machines?
 
@@ -82,7 +82,7 @@ cd agent_bench && bash test_ops.sh add --device-count 1
 
 **A**:
 - LLM Track: `output/pass_at_k/<timestamp>/`
-- Agent Track: `agent_bench/runs/<timestamp>/`
+- Agent Track: `agent_bench/runs/<method>_<dataset>_<timestamp>/`
 
 ## Errors
 

diff --git a/docs/source/features/anti-hack.md b/docs/source/features/anti-hack.md
@@ -13,18 +13,27 @@ The anti-hack architecture guards against "cheating" behaviors where generated c
 
 ### Purpose
 
-Block blacklisted API calls before execution.
+Enforce a whitelist-based approach: most `torch.*` API calls are forbidden.
+Only tensor creation, dtype helpers, and constants are allowed.
 
 ### Method
 
 Parse the generated abstract syntax tree (AST) to detect and block:
 
+**Whitelist (allowed torch APIs):**
+`torch.empty`, `torch.zeros`, `torch.randn`, `torch.arange`, `torch.float16`, etc.
+
+**Detected patterns (blocked):**
+
 | Blocked Pattern | Reason |
 |-----------------|--------|
-| `torch.ops.aten.*` | Direct PyTorch API calls |
-| `import vllm` | Using {term}`vLLM` implementation |
-| `ctypes` | Dynamic library loading |
-| `__import__` | Dynamic imports |
+| `torch.*()` not in whitelist | Prevents using torch.sum/mean/mm/reductions |
+| `print()` | Prevents input sniffing from test harness |
+| `.data_ptr()` / `.storage()` | Prevents raw memory access |
+| Module-level `_cache = {}` | Prevents inter-iteration result caching |
+| `import vllm` | Using pre-existing implementations |
+| `exec()` / `eval()` | Dynamic code execution |
+| Import alias / `getattr()` bypass | Catches obfuscation attempts |
 
 ### Implementation
 

diff --git a/docs/source/getting-started/index.md b/docs/source/getting-started/index.md
@@ -116,6 +116,7 @@ python scripts/generate_kernel_and_verify.py \
 | Dataset | Operators | Description |
 |---------|-----------|-------------|
 | `KernelGenBench` | 210 | Full set (ATen + vLLM + cuBLAS, NVIDIA) |
+| `KernelGenBench-nocublas` | 160 | ATen + vLLM (NVIDIA, no cuBLAS) |
 | `KernelGenBench-aten` | 110 | ATen operators only |
 | `KernelGenBench-vllm` | 50 | vLLM operators only (NVIDIA only) |
 | `KernelGenBench-cublas` | 50 | cuBLAS operators only (NVIDIA only) |

diff --git a/docs/source/operation-guide/agent-track/methods.md b/docs/source/operation-guide/agent-track/methods.md
@@ -10,9 +10,9 @@ Available agent methods in KernelGenBench.
 | `normal_cc` | Claude Code + self-verification loop | Opus, GLM, Qwen, MiniMax |
 | `naive_opencode` | Single OpenCode call | Opus, GLM, Qwen, MiniMax |
 | `normal_opencode` | OpenCode + self-verification loop | Opus, GLM, Qwen, MiniMax |
-| AutoKernel | Automated kernel optimization | GLM, Qwen |
-| AKO4ALL | Kernel optimization for all operators | Opus |
-| cuda-optimized-skill | CUDA optimization with strategy memory | Opus |
+| AutoKernel | Automated kernel optimization (standalone: `test_autokernel.sh`) | GLM, Qwen |
+| AKO4ALL | Kernel optimization for all operators (standalone: `test_ako4all.sh`) | Opus |
+| cuda-optimized-skill | CUDA optimization with strategy memory (standalone) | Opus |
 
 ## Basic Agent Frameworks
 

diff --git a/docs/source/operation-guide/llm-track/parameters.md b/docs/source/operation-guide/llm-track/parameters.md
@@ -38,7 +38,7 @@ Specify a single operator to test:
 --op-name vllm13::rms_norm
 
 # cuBLAS operator
---op-name cublasSgemm_v2
+--op-name cublas::cublasSgemm_v2
 ```
 
 ### --dataset
@@ -69,6 +69,6 @@ Results saved to `output/pass_at_k/<timestamp>/`:
 
 | File | Description |
 |------|-------------|
-| `results.json` | Complete results |
+| `pass_at_k_results.json` | Complete results |
 | `kernels/` | Generated kernel files |
 | `checkpoints/` | Resume checkpoints |
diff --git a/docs/source/reference/datasets.md b/docs/source/reference/datasets.md
@@ -1,12 +1,13 @@
 # Datasets
 
-{term}`KernelGenBench` provides four dataset variants for different evaluation scenarios.
+{term}`KernelGenBench` provides five dataset variants for different evaluation scenarios.
 
 ## Dataset Overview
 
 | Dataset | Operators | Sources | Platforms |
 |---------|-----------|---------|-----------|
 | {term}`KernelGenBench` | 210 | {term}`ATen` + {term}`vLLM` + {term}`cuBLAS` | NVIDIA only |
+| {term}`KernelGenBench-nocublas` | 160 | {term}`ATen` + {term}`vLLM` | NVIDIA only |
 | {term}`KernelGenBench-aten` | 110 | {term}`ATen` only | All platforms |
 | {term}`KernelGenBench-vllm` | 50 | {term}`vLLM` only | NVIDIA only |
 | {term}`KernelGenBench-cublas` | 50 | {term}`cuBLAS` only | NVIDIA only |

diff --git a/docs/source/reference/operators.md b/docs/source/reference/operators.md
@@ -13,12 +13,12 @@ Reference list of operator sources in {term}`KernelGenBench`.
 
 | Category | Operators |
 |----------|-----------|
-| Arithmetic | `add`, `sub`, `mul`, `div` |
-| Reduction | `sum`, `mean`, `max`, `min` |
+| Arithmetic | `add`, `div`, `floor_divide`, `cos`, `sin` |
+| Reduction | `sum`, `mean`, `argmax`, `amin` |
 | Linear | `matmul`, `linear`, `bmm` |
-| Normalization | `softmax`, `layer_norm` |
-| Activation | `relu`, `gelu`, `silu` |
-| Shape Operations | `reshape`, `transpose`, `permute` |
+| Normalization | `_softmax`, `softmax` |
+| Activation | `hardsigmoid`, `prelu`, `heaviside` |
+| Shape Operations | `expand`, `repeat`, `view` |
 
 ## vLLM Operators (50)
 
@@ -45,9 +45,9 @@ Reference list of operator sources in {term}`KernelGenBench`.
 
 | Precision | Standard | StridedBatched | Batched |
 |-----------|----------|----------------|---------|
-| Float32 | `cublasSgemm` | `cublasSgemmStridedBatched` | `cublasSgemmBatched` |
+| Float32 | `cublasSgemm_v2` | `cublasSgemmStridedBatched` | `cublasSgemmBatched_64` |
 | Float64 | — | `cublasDgemmStridedBatched` | `cublasDgemmBatched` |
-| Complex64 | `cublasCgemm` | `cublasCgemmStridedBatched` | — |
+| Complex64 | `cublasCgemm_v2` | `cublasCgemmStridedBatched` | — |
 | Complex128 | — | `cublasZgemmStridedBatched` | `cublasZgemmBatched` |
 | Float16 | — | `cublasHgemmStridedBatched` | `cublasHgemmBatched` |
 
@@ -64,4 +64,4 @@ Reference list of operator sources in {term}`KernelGenBench`.
 |--------|--------|---------|
 | {term}`ATen` | `aten::` | `aten::add.Tensor` |
 | {term}`vLLM` | `vllm13::` | `vllm13::rms_norm` |
-| {term}`cuBLAS` | `cublas` | `cublasSgemm_v2` |
+| {term}`cuBLAS` | `cublas::` | `cublas::cublasSgemm_v2` |