diff --git a/docs/exp.md b/docs/exp.md
new file mode 100644
index 0000000..8ef757d
--- /dev/null
+++ b/docs/exp.md
@@ -0,0 +1,295 @@
+\section{Experiments and Evaluation}
+\label{sec:experiments}
+
+In this section, we deploy KernelGenBench to evaluate LLMs and agentic frameworks across diverse operator sources and heterogeneous hardware platforms.
+
+\subsection{Experimental Setup}
+\label{subsec:setup}
+
+We deploy the evaluation suite across six dedicated full-node hardware platforms: NVIDIA A100 and five alternative hardware platforms (Platform A--E). The full 210-operator problem set is evaluated on the NVIDIA baseline, while the 110-operator ATen subset is used to benchmark cross-platform portability. For all language models, we use \texttt{temperature}=0 for Pass@1 generation and \texttt{temperature}=0.8 for Pass@5, with a context window limit of \texttt{max\_tokens}=16384 and a unified 30-minute wall-clock timeout per operator task. Evaluated paradigms are detailed in Section~\ref{subsec:pipeline_and_antihack}. Results for legacy models are deferred to Appendix~\ref{app:legacy_baselines}.
+
+\subsection{KernelGenBench-MS: Multi-Source Evaluation}
+\label{subsec:nvidia_analysis}
+
+We evaluate all methods on NVIDIA A100 across the full 210-operator suite. Table~\ref{tab:nvidia_main_ops} presents the comprehensive results.
+
+% \begin{table}[htbp]
+% \centering
+% \setlength{\tabcolsep}{4pt}
+% \caption{NVIDIA A100 evaluation across 210 operators from three sources (ATen, vLLM, cuBLAS), showing accuracy and speedup by operator source across all generation paradigms.}
+% \label{tab:nvidia_main_ops}
+% \resizebox{\textwidth}{!}{%
+% \begin{tabular}{l | cc | cc | cc | cc}
+% \toprule
+% \multirow{2}{*}{\textbf{Method \& Setup}} & \multicolumn{2}{c|}{\textbf{Overall (210)}} & \multicolumn{2}{c|}{\textbf{ATen (110)}} & \multicolumn{2}{c|}{\textbf{vLLM (50)}} & \multicolumn{2}{c}{\textbf{cuBLAS (50)}} \\
+% \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
+% & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) \\
+% \midrule
+% % --- LLM Sampling Methods ---
+% \multicolumn{9}{l}{\textbf{LLM Sampling Methods}} \\
+% \midrule
+% Pass@1 (Opus-4.6)       & 41 & 0.70 & 39 & 0.90 & 20 & 0.76 & 68 & 0.49 \\
+% Pass@1 (GLM-5.0)        & 21 & 0.68 & 21 & 0.49 & 24 & 1.24 & 20 & 0.73 \\
+% Pass@1 (Qwen3.5-27b)    & 7  & 0.85 & 8  & 0.83 & 2  & 2.05 & 8  & 0.71 \\
+% Pass@1 (MiniMax M-2.5)  & 2  & 0.88 & 4  & 0.88 & 0  & 0.00 & 0  & 0.00 \\
+% \midrule
+% Pass@5 (Opus-4.6)       & 57 & 0.68 & 62 & 0.79 & 28 & 0.71 & 74 & 0.49 \\
+% Pass@5 (GLM-5.0)        & 36 & 0.77 & 45 & 0.64 & 32 & 1.28 & 20 & 0.76 \\
+% Pass@5 (Qwen3.5-27b)    & 11 & 1.01 & 13 & 1.04 & 12 & 0.70 & 8  & 0.68 \\
+% Pass@5 (MiniMax M-2.5)  & 17 & 0.69 & 21 & 0.76 & 18 & 1.27 & 2  & 0.46 \\
+% \midrule
+% % --- Vanilla Agentic Frameworks ---
+% \multicolumn{9}{l}{\textbf{Vanilla Agentic Frameworks}} \\
+% \midrule
+% Claude Code (Opus-4.6)     & 87 & 0.78 & 92 & 0.86 & 68 & 1.02 & 94 & 0.51 \\
+% Claude Code (GLM-5.0)      & 67 & 0.83 & 72 & 0.88 & 52 & 1.23 & 72 & 0.53 \\
+% Claude Code (Qwen3.5-27b)  & 62 & 0.70 & 80 & 0.68 & 38 & 1.17 & 48 & 0.50 \\
+% Claude Code (MiniMax M-2.5)& 49 & 0.69 & 69 & 0.78 & 26 & 0.46 & 26 & 0.58 \\
+% \midrule
+% OpenCode (Opus-4.6)        & 81 & 0.73 & 92 & 0.82 & 46 & 0.97 & 92 & 0.50 \\
+% OpenCode (GLM-5.0)         & 72 & 0.69 & 87 & 0.71 & 42 & 0.94 & 70 & 0.51 \\
+% OpenCode (Qwen3.5-27b)     & 53 & 0.78 & 58 & 0.75 & 44 & 1.31 & 52 & 0.58 \\
+% OpenCode (MiniMax M-2.5)   & 41 & 0.62 & 50 & 0.77 & 26 & 0.44 & 36 & 0.42 \\
+% \midrule
+% % --- Kernel-Specialized Agents ---
+% \multicolumn{9}{l}{\textbf{Kernel-Specialized Agents}} \\
+% \midrule
+% AKO4all (Opus-4.6)              & 83 & 0.97 & 91 & 1.00 & 64 & 1.62 & 84 & 0.61 \\
+% CUDA Opt. Skill (MiniMax M-2.5) & 45 & 0.80 & 63 & 0.81 & 24 & 0.92 & 28 & 0.45 \\
+% \midrule
+% AutoKernel (GLM-5.0)            & 71 & 0.99 & 87 & 1.00 & 43 & 1.40 & 66 & 0.75 \\
+% AutoKernel (Qwen3.5-27b)        & 47 & 1.02 & 69 & 1.00 & 16 & 1.63 & 30 & 0.80 \\
+% AutoKernel (MiniMax M-2.5)      & 43 & 0.89 & 66 & 0.87 & 20 & 1.52 & 16 & 0.46 \\
+% \bottomrule
+% \end{tabular}%
+% }
+% \end{table}
+
+\begin{table}[htbp]
+\centering
+\setlength{\tabcolsep}{4pt}
+\caption{NVIDIA A100 evaluation across 210 operators from three sources (ATen, vLLM, cuBLAS), showing accuracy and speedup by operator source across all generation paradigms.}
+\label{tab:nvidia_main_ops}
+\resizebox{\textwidth}{!}{%
+\begin{tabular}{l | cc | cc | cc | cc}
+\toprule
+\multirow{2}{*}{\textbf{Method \& Setup}} & \multicolumn{2}{c|}{\textbf{Overall (210)}} & \multicolumn{2}{c|}{\textbf{ATen (110)}} & \multicolumn{2}{c|}{\textbf{vLLM (50)}} & \multicolumn{2}{c}{\textbf{cuBLAS (50)}} \\
+\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
+& Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) \\
+\midrule
+% --- LLM Sampling Methods ---
+\multicolumn{9}{l}{\textbf{LLM Sampling Methods}} \\
+\midrule
+Pass@1 (Opus-4.6)       & 41 & 0.70 & 39 & 0.90 & 20 & 0.76 & 68 & 0.49 \\
+Pass@1 (GLM-5.0)        & 21 & 0.68 & 21 & 0.49 & 24 & 1.24 & 20 & 0.73 \\
+Pass@1 (Qwen3.5-27b)    & 7  & 0.85 & 8  & 0.83 & 2  & \textbf{2.05} & 8  & 0.71 \\
+Pass@1 (MiniMax M-2.5)  & 2  & 0.88 & 4  & 0.88 & 0  & 0.00 & 0  & 0.00 \\
+\midrule
+Pass@5 (Opus-4.6)       & 57 & 0.68 & 62 & 0.79 & 28 & 0.71 & 74 & 0.49 \\
+Pass@5 (GLM-5.0)        & 36 & 0.77 & 45 & 0.64 & 32 & 1.28 & 20 & \underline{0.76} \\
+Pass@5 (Qwen3.5-27b)    & 11 & \underline{1.01} & 13 & \textbf{1.04} & 12 & 0.70 & 8  & 0.68 \\
+Pass@5 (MiniMax M-2.5)  & 17 & 0.69 & 21 & 0.76 & 18 & 1.27 & 2  & 0.46 \\
+\midrule
+% --- Vanilla Agentic Frameworks ---
+\multicolumn{9}{l}{\textbf{Vanilla Agentic Frameworks}} \\
+\midrule
+Claude Code (Opus-4.6)     & 87 & 0.78 & 92 & 0.86 & 68 & 1.02 & 94 & 0.51 \\
+Claude Code (GLM-5.0)      & 67 & 0.83 & 72 & 0.88 & 52 & 1.23 & 72 & 0.53 \\
+Claude Code (Qwen3.5-27b)  & 62 & 0.70 & 80 & 0.68 & 38 & 1.17 & 48 & 0.50 \\
+Claude Code (MiniMax M-2.5)& 49 & 0.69 & 69 & 0.78 & 26 & 0.46 & 26 & 0.58 \\
+\midrule
+OpenCode (Opus-4.6)        & 81 & 0.73 & 92 & 0.82 & 46 & 0.97 & 92 & 0.50 \\
+OpenCode (GLM-5.0)         & 72 & 0.69 & 87 & 0.71 & 42 & 0.94 & 70 & 0.51 \\
+OpenCode (Qwen3.5-27b)     & 53 & 0.78 & 58 & 0.75 & 44 & 1.31 & 52 & 0.58 \\
+OpenCode (MiniMax M-2.5)   & 41 & 0.62 & 50 & 0.77 & 26 & 0.44 & 36 & 0.42 \\
+\midrule
+% --- Kernel-Specialized Agents ---
+\multicolumn{9}{l}{\textbf{Kernel-Specialized Agents}} \\
+\midrule
+AKO4all (Opus-4.6)              & 83 & 0.97 & 91 & \underline{1.00} & 64 & 1.62 & 84 & 0.61 \\
+CUDA Opt. Skill (MiniMax M-2.5) & 45 & 0.80 & 63 & 0.81 & 24 & 0.92 & 28 & 0.45 \\
+\midrule
+AutoKernel (GLM-5.0)            & 71 & 0.99 & 87 & \underline{1.00} & 43 & 1.40 & 66 & 0.75 \\
+AutoKernel (Qwen3.5-27b)        & 47 & \textbf{1.02} & 69 & \underline{1.00} & 16 & \underline{1.63} & 30 & \textbf{0.80} \\
+AutoKernel (MiniMax M-2.5)      & 43 & 0.89 & 66 & 0.87 & 20 & 1.52 & 16 & 0.46 \\
+\bottomrule
+\end{tabular}%
+}
+\end{table}
+
+\textbf{Finding 1: Model and Method Capabilities.}
+Claude Code (Opus-4.6) achieves the highest overall accuracy at 87\%, while AutoKernel (Qwen3.5-27b) achieves the highest overall speedup at 1.02$\times$. Comparing the two top performers reveals a fundamental trade-off: Claude Code outperforms the kernel-specialized AKO4all (Opus-4.6) at 83\% accuracy by 4~pp, but AKO4all achieves 0.97$\times$ speedup versus Claude Code's 0.78$\times$. Kernel-specialized agents prioritize performance optimization over correctness, sacrificing functional correctness on edge cases to maximize speedup. By contrast, vanilla agentic frameworks allocate more iterations to debugging and correctness verification, yielding higher pass rates at the cost of performance. Speedup distribution metrics (fast$_p$) for all configurations are detailed in Appendix~\ref{app:fastp_results}.
+
+Breaking down by operator source reveals where specialization pays off. On ATen operators, both Claude Code and OpenCode with Opus-4.6 reach 92\% accuracy, while kernel-specialized agents (AKO4all, AutoKernel with GLM-5.0/Qwen3.5-27b) achieve 1.00$\times$ speedup, perfectly matching the baseline. On vLLM operators, Claude Code (Opus-4.6) leads in accuracy at 68\%, but AutoKernel (Qwen3.5-27b) delivers 1.63$\times$ speedup---the highest across all sources---demonstrating that specialized agents excel at performance optimization on complex operators. On cuBLAS operators, Claude Code (Opus-4.6) achieves 94\% accuracy, yet even the best speedup (AutoKernel Qwen at 0.80$\times$) remains below the proprietary baseline. Model-specific generation bottlenecks and anti-hack interception patterns are detailed in Appendix~\ref{app:generation_behaviors}. 
+
+\textbf{Finding 2: Operator-Source Difficulty Hierarchy.}
+A strict, model-agnostic difficulty hierarchy emerges across the three operator sources. ATen operators are the most tractable: even weaker models achieve reasonable accuracy (e.g., Claude Code with MiniMax reaches 69\%), and speedup consistently clusters around 0.8--1.0$\times$, matching the framework baseline. This reflects ATen's design as a high-level framework API with well-defined semantics and moderate performance requirements.
+
+vLLM operators present the opposite profile---functional correctness is extremely challenging (accuracy drops sharply, e.g., Claude Code with MiniMax falls to 26\%), yet when successfully generated, they deliver genuine acceleration potential (AKO4all achieves 1.62$\times$, AutoKernel with Qwen reaches 1.63$\times$). This difficulty stems from vLLM's complex inference-specific operators (paged attention, KV cache management, mixed-precision quantization), which require intricate memory layouts and algorithmic understanding that LLMs struggle to implement correctly. However, the baseline implementations are not heavily hand-tuned, leaving room for optimization when correctness is achieved.
+
+cuBLAS operators occupy the middle ground: moderate accuracy is achievable (Claude Code with Opus reaches 94\%), but speedup is universally capped---virtually all configurations cluster tightly around 0.50$\times$, unable to surpass the hand-tuned proprietary baseline. This performance ceiling reflects cuBLAS's status as a closed-source, heavily optimized library representing decades of expert engineering. The baseline directly loads \texttt{libcublas.so} via \texttt{ctypes}, bypassing all high-level wrappers, making it nearly impossible for LLM-generated Triton kernels to match proprietary BLAS performance. This hierarchy reveals that ATen serves as the tractable baseline, vLLM tests optimization capability on complex kernels, and cuBLAS exposes the fundamental difficulty of matching closed-source performance.
+
+
+\subsection{KernelGenBench-MC: Cross-Platform Evaluation}
+\label{subsec:cross_platform_analysis}
+
+We extend the evaluation of the 110 ATen operators across six hardware platforms. Table~\ref{tab:cross_platform_main} presents the comprehensive results.
+
+% \begin{table}[htbp]
+% \centering
+% \setlength{\tabcolsep}{3.5pt}
+% \caption{Cross-platform evaluation on 110 ATen operators across six hardware platforms, showing whether correctness and speedup transfer across heterogeneous hardware backends.}
+% \label{tab:cross_platform_main}
+% \resizebox{\textwidth}{!}{%
+% \begin{tabular}{l | cc | cc | cc | cc | cc | cc}
+% \toprule
+% \multirow{2}{*}{\textbf{Method \& Setup}} & \multicolumn{2}{c|}{\textbf{NVIDIA}} & \multicolumn{2}{c|}{\textbf{Platform A}} & \multicolumn{2}{c|}{\textbf{Platform B}} & \multicolumn{2}{c|}{\textbf{Platform C}} & \multicolumn{2}{c|}{\textbf{Platform D}} & \multicolumn{2}{c}{\textbf{Platform E}} \\
+% \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9} \cmidrule(lr){10-11} \cmidrule(lr){12-13}
+% & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) \\
+% \midrule
+% % --- LLM Sampling Methods ---
+% \multicolumn{13}{l}{\textbf{LLM Sampling Methods}} \\
+% \midrule
+% Pass@1 (Opus-4.6)           & 39 & 0.90 & 46 & 0.19 & 44 & 0.69 & 37 & 0.98 & 38 & 0.89 & 38 & 0.88 \\
+% Pass@1 (Qwen3.5-27b)        & 8  & 0.83 & 9  & 0.09 & 3  & 1.02 & 7  & 0.90 & 7  & 0.98 & 10 & 1.03 \\
+% Pass@1 (MiniMax M-2.5)      & 4  & 0.88 & 4  & 0.25 & 6  & 1.33 & 4  & 1.05 & 5  & 0.77 & 4  & 1.15 \\
+% \midrule
+% Pass@5 (Opus-4.6)           & 62 & 0.79 & 63 & 0.15 & 60 & 0.74 & 54 & 0.92 & 65 & 0.68 & 57 & 0.83 \\
+% Pass@5 (Qwen3.5-27b)        & 13 & 1.04 & 16 & 0.18 & 11 & 1.10 & 15 & 0.72 & 10 & 0.99 & 17 & 1.02 \\
+% Pass@5 (MiniMax M-2.5)      & 21 & 0.76 & 17 & 0.20 & 15 & 0.53 & 12 & 1.05 & 8  & 0.33 & 9  & 0.76 \\
+% \midrule
+% % --- Vanilla Agentic Frameworks ---
+% \multicolumn{13}{l}{\textbf{Vanilla Agentic Frameworks}} \\
+% \midrule
+% Claude Code (Opus-4.6)      & 92 & 0.86 & 89 & 0.18 & 93 & 0.80 & 88 & 0.87 & 96 & 0.89 & 83 & 0.83 \\
+% Claude Code (GLM-5.0)       & 67 & 0.83 & 65 & 0.16 & 65 & 0.96 & 65 & 0.81 & 59 & 0.90 & 37 & 0.77 \\
+% Claude Code (Qwen3.5-27b)   & 80 & 0.68 & 78 & 0.25 & 75 & 0.61 & 75 & 0.85 & 82 & 0.77 & 23 & 0.81 \\
+% Claude Code (MiniMax M-2.5) & 69 & 0.78 & 69 & 0.16 & 74 & 0.59 & 73 & 0.72 & 83 & 0.63 & 69 & 0.58 \\
+% \midrule
+% % --- Kernel-Specialized Agents ---
+% \multicolumn{13}{l}{\textbf{Kernel-Specialized Agents}} \\
+% \midrule
+% AKO4all (Opus-4.6)              & 89 & 1.00 & 84 & 0.30 & 88 & 1.09 & 88 & 1.08 & 86 & 1.12 & 80 & 1.07 \\
+% CUDA Opt. Skill (MiniMax M-2.5) & 63 & 0.81 & 53 & 0.21 & 64 & 0.77 & 65 & 0.81 & 67 & 0.77 & 58 & 0.79 \\
+% \midrule
+% AutoKernel (GLM-5.0)            & 87 & 1.00 & 53 & 0.82 & 56 & 1.01 & 64 & 0.99 & 59 & 1.00 & 25 & 1.01 \\
+% AutoKernel (Qwen3.5-27b)        & 69 & 1.00 & 40 & 0.37 & 75 & 1.03 & 65 & 1.00 & 74 & 1.04 & 21 & 1.01 \\
+% AutoKernel (MiniMax M-2.5)      & 66 & 0.87 & 61 & 0.66 & 71 & 1.36 & 66 & 0.99 & 71 & 1.04 & 50 & 1.02 \\
+% \bottomrule
+% \end{tabular}%
+% }
+% \end{table}
+
+\begin{table}[htbp]
+\centering
+\setlength{\tabcolsep}{3.5pt}
+\caption{Cross-platform evaluation on 110 ATen operators across six hardware platforms, showing whether correctness and speedup transfer across heterogeneous hardware backends.}
+\label{tab:cross_platform_main}
+\resizebox{\textwidth}{!}{%
+\begin{tabular}{l | cc | cc | cc | cc | cc | cc}
+\toprule
+\multirow{2}{*}{\textbf{Method \& Setup}} & \multicolumn{2}{c|}{\textbf{NVIDIA}} & \multicolumn{2}{c|}{\textbf{Platform A}} & \multicolumn{2}{c|}{\textbf{Platform B}} & \multicolumn{2}{c|}{\textbf{Platform C}} & \multicolumn{2}{c|}{\textbf{Platform D}} & \multicolumn{2}{c}{\textbf{Platform E}} \\
+\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9} \cmidrule(lr){10-11} \cmidrule(lr){12-13}
+& Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) & Acc (\%) & Spd ($\times$) \\
+\midrule
+% --- LLM Sampling Methods ---
+\multicolumn{13}{l}{\textbf{LLM Sampling Methods}} \\
+\midrule
+Pass@1 (Opus-4.6)           & 39 & 0.90 & 46 & 0.19 & 44 & 0.69 & 37 & 0.98 & 38 & 0.89 & 38 & 0.88 \\
+Pass@1 (Qwen3.5-27b)        & 8  & 0.83 & 9  & 0.09 & 3  & 1.02 & 7  & 0.90 & 7  & 0.98 & 10 & 1.03 \\
+Pass@1 (MiniMax M-2.5)      & 4  & 0.88 & 4  & 0.25 & 6  & \underline{1.33} & 4  & \underline{1.05} & 5  & 0.77 & 4  & \textbf{1.15} \\
+\midrule
+Pass@5 (Opus-4.6)           & 62 & 0.79 & 63 & 0.15 & 60 & 0.74 & 54 & 0.92 & 65 & 0.68 & 57 & 0.83 \\
+Pass@5 (Qwen3.5-27b)        & 13 & \textbf{1.04} & 16 & 0.18 & 11 & 1.10 & 15 & 0.72 & 10 & 0.99 & 17 & 1.02 \\
+Pass@5 (MiniMax M-2.5)      & 21 & 0.76 & 17 & 0.20 & 15 & 0.53 & 12 & \underline{1.05} & 8  & 0.33 & 9  & 0.76 \\
+\midrule
+% --- Vanilla Agentic Frameworks ---
+\multicolumn{13}{l}{\textbf{Vanilla Agentic Frameworks}} \\
+\midrule
+Claude Code (Opus-4.6)      & 92 & 0.86 & 89 & 0.18 & 93 & 0.80 & 88 & 0.87 & 96 & 0.89 & 83 & 0.83 \\
+Claude Code (GLM-5.0)       & 67 & 0.83 & 65 & 0.16 & 65 & 0.96 & 65 & 0.81 & 59 & 0.90 & 37 & 0.77 \\
+Claude Code (Qwen3.5-27b)   & 80 & 0.68 & 78 & 0.25 & 75 & 0.61 & 75 & 0.85 & 82 & 0.77 & 23 & 0.81 \\
+Claude Code (MiniMax M-2.5) & 69 & 0.78 & 69 & 0.16 & 74 & 0.59 & 73 & 0.72 & 83 & 0.63 & 69 & 0.58 \\
+\midrule
+% --- Kernel-Specialized Agents ---
+\multicolumn{13}{l}{\textbf{Kernel-Specialized Agents}} \\
+\midrule
+AKO4all (Opus-4.6)              & 89 & \underline{1.00} & 84 & 0.30 & 88 & 1.09 & 88 & \textbf{1.08} & 86 & \textbf{1.12} & 80 & \underline{1.07} \\
+CUDA Opt. Skill (MiniMax M-2.5) & 63 & 0.81 & 53 & 0.21 & 64 & 0.77 & 65 & 0.81 & 67 & 0.77 & 58 & 0.79 \\
+\midrule
+AutoKernel (GLM-5.0)            & 87 & \underline{1.00} & 53 & \textbf{0.82} & 56 & 1.01 & 64 & 0.99 & 59 & 1.00 & 25 & 1.01 \\
+AutoKernel (Qwen3.5-27b)        & 69 & \underline{1.00} & 40 & 0.37 & 75 & 1.03 & 65 & \underline{1.00} & 74 & \underline{1.04} & 21 & 1.01 \\
+AutoKernel (MiniMax M-2.5)      & 66 & 0.87 & 61 & \underline{0.66} & 71 & \textbf{1.36} & 66 & 0.99 & 71 & \underline{1.04} & 50 & 1.02 \\
+\bottomrule
+\end{tabular}%
+}
+\end{table}
+
+\textbf{Finding 1: Model and Method Capabilities Across Platforms.}
+For accuracy, Claude Code (Opus-4.6) consistently achieves the highest rates across most platforms: 92\% on NVIDIA, 89\% on Platform A, 93\% on Platform B, 88\% on Platform C, and 96\% on Platform D; Platform E is the exception, where Claude Code (MiniMax M-2.5) reaches 69\%. For speedup, kernel-specialized agents dominate: AKO4all (Opus-4.6) achieves the best overall performance with 1.00$\times$ on NVIDIA, 1.08$\times$ on Platform C, 1.12$\times$ on Platform D, and 1.07$\times$ on Platform E, while AutoKernel (MiniMax M-2.5) reaches the highest single-platform speedup at 1.36$\times$ on Platform B. However, kernel-specialized agents exhibit severe accuracy variance across platforms: AKO4all ranges from 89\% on NVIDIA to 80\% on Platform E, while AutoKernel (Qwen3.5-27b) spans 75\% on Platform B down to 21\% on Platform E—revealing that even state-of-the-art specialized methods struggle with cross-platform portability.
+
+This divergence stems from how methods utilize platform-specific information. Vanilla agentic frameworks like Claude Code provide minimal initial context but include hardware-specific constraints (e.g., API limitations, type strictness) in the prompt; agents actively leverage this information during debugging, adapting their implementations to platform quirks. Kernel-specialized agents, by contrast, focus heavily on performance tuning—profiling, block-size search, memory-access optimization—and often overlook the provided platform constraints, leading to compilation failures or runtime errors on non-NVIDIA backends despite achieving superior speedup when kernels do compile successfully. 
+
+\textbf{Finding 2: Platform-Specific Performance Divergence.}
+Cross-platform evaluation exposes severe performance heterogeneity. Platform A suffers from a catastrophic speedup collapse: Claude Code (Opus-4.6) maintains 89\% accuracy but achieves only 0.18$\times$ speedup, the lowest across all platforms. This 4.8$\times$ degradation relative to NVIDIA (0.86$\times$) reveals unoptimized backend implementations despite functional correctness. Platform E exhibits the opposite failure mode---accuracy collapse: Claude Code with Qwen3.5-27b drops to 23\%, and AutoKernel variants fall to 21--25\%, significantly lower than other platforms (typically 60--90\%). This reflects immature vendor compilers that frequently hang or crash when processing unstructured LLM-generated code, leading to compilation timeouts. Furthermore, non-NVIDIA platforms incur massive compilation overheads: Platform A requires 2.1$\times$ tokens and 2.0$\times$ time relative to NVIDIA (Figure~\ref{fig:radar_cross_platform}), forcing agents to burn iteration budgets on compilation debugging rather than kernel optimization.
+
+\begin{figure}[htbp]
+\centering
+\includegraphics[width=0.75\textwidth]{figures/fig2_radar_cross_platform.pdf}
+\caption{Platform A collapses speedup to 0.18$\times$ despite high accuracy, while non-NVIDIA platforms incur up to 2$\times$ compilation overhead. Left: radar chart showing accuracy and speedup across six platforms (110 ATen, Claude Code Opus-4.6). Right: token and time overhead relative to NVIDIA.}
+\label{fig:radar_cross_platform}
+\end{figure}
+
+\textbf{Finding 3: Cross-Platform Cost Overhead.}
+Non-NVIDIA platforms incur massive compilation overheads, as quantified in Figure~\ref{fig:radar_cross_platform} (right panel). Platform A exhibits the most severe overhead: 173M total tokens (2.06$\times$ NVIDIA's 84M baseline) and 18 hours (2.00$\times$ NVIDIA's 9 hours). Platform B requires 128M tokens (1.52$\times$) and 16 hours (1.78$\times$), while Platforms C, D, and E consume 107--123M tokens (1.27--1.46$\times$) and 15--16 hours (1.67--1.78$\times$). This overhead is not algorithmic—it is purely ecosystem friction from immature vendor compilers and incomplete Triton backend support.
+
+The root cause is that models lack prior exposure to heterogeneous hardware constraints during pretraining. When generating kernels for alternative platforms, agents must iteratively discover platform-specific limitations through trial and error: API availability (e.g., missing \texttt{tl.acosh} or \texttt{tl.math.tanh}), type system strictness (mixed int32/int64 loops that compile silently on CUDA but raise hard errors elsewhere), pointer addressing modes (32-bit vs 64-bit), and LLVM IR compatibility gaps. Each compilation failure forces the agent to burn tokens diagnosing opaque backend errors, adjusting code to satisfy undocumented constraints, and re-attempting compilation—consuming iteration budgets that would otherwise be spent on functional correctness or performance optimization. Platform A's 2$\times$ overhead directly reflects its backend's fragility: frequent compilation hangs and cryptic error messages force agents into extended debugging cycles, while Platform E's compiler instability (leading to the 21--25\% accuracy collapse noted in Finding 2) similarly inflates token costs as agents repeatedly retry failed compilations before timing out.
+
+\subsection{Accuracy-Speedup Gap}
+\label{subsec:accuracy_speedup_gap}
+
+Figure~\ref{fig:dumbbell} plots per-operator accuracy against speedup on the NVIDIA baseline across 16 configurations. A systematic pattern emerges: accuracy spans the full range (2\% for MiniMax Pass@1 to 87\% for Claude Code Opus-4.6), while speedup clusters tightly in a narrow band (0.62--1.01$\times$, with 14 of 16 configurations falling within 0.68--0.83$\times$). The two outliers are Qwen3.5 Pass@5 at 1.01$\times$ (11\% accuracy) and MiniMax OpenCode at 0.62$\times$ (41\% accuracy). This divergence reveals survivorship bias: operators that weaker models fail to solve are disproportionately the computationally complex ones, where the baseline is heavily hand-tuned and high speedup is hardest to achieve. Weaker configurations report higher average speedup not because they optimize better, but because they fail the complex tasks and avoid their severe performance penalties (e.g., the 0.50$\times$ ceiling of cuBLAS). They only survive the simpler operators where matching the baseline (achieving $\sim$0.8--1.0$\times$) is relatively easy. Consequently, speedup comparisons across methods are only meaningful when conditioned on matched operator subsets.
+
+\begin{figure}[htbp]
+\centering
+\includegraphics[width=\textwidth]{figures/fig3_dumbbell.pdf}
+\caption{Accuracy--speedup divergence: accuracy spans the full range while speedup clusters in a narrow band---``accuracy leaps, speedup stalls.''}
+\label{fig:dumbbell}
+\end{figure}
+
+
+\subsection{Trajectory Analysis}
+\label{subsec:trajectory}
+
+Analysis of hundreds of complete LLM and agent trajectories uncovers two distinct failure layers. \textbf{Universal algorithmic failures} occur across all platforms: infinite dispatch recursion (calling the overridden ATen operator internally triggers unbounded recursion), hallucinated Triton APIs (models generate calls to non-existent functions such as \texttt{tl.pow}, \texttt{tl.einsum}, \texttt{tl.gather}), and algorithmically hard operators (\texttt{matmul}, \texttt{sort}, \texttt{cumsum} require cross-block parallel algorithms that agents rarely converge to).
+
+\textbf{Heterogeneous-platform failures} directly explain the accuracy collapse on Platform E: LLVM IR incompatibility triggers \texttt{PassManager::run failed} errors (31 observed occurrences on one platform), 32-bit pointer addressing causes memory errors on large-tensor operators, and missing math APIs (\texttt{tl.acosh}, \texttt{tl.math.tanh}) must be manually reimplemented. Further failure patterns—8 universal and 6 platform-specific—are provided in Appendix~\ref{app:platform_failures}.
+
+\subsection{Agentic Cost Efficiency}
+\label{subsec:cost}
+
+The introduction of closed-loop execution brings significant economic and time overhead. Table~\ref{tab:nvidia_cost_ops} quantifies this cost on the NVIDIA A100 baseline. Kernel-specialized agents universally consume far more tokens and time than vanilla frameworks---their extended iteration budgets are spent optimizing kernel performance. AKO4all is the most extreme case, requiring 904M tokens and 83 hours---over 3$\times$ the token cost of Claude Code (Opus-4.6) at 263M---achieving 5.19M tokens per successful operator. Across all kernel-specialized methods (AKO4all, CUDA Optimized Skill, AutoKernel variants), the average tokens per successful operator is 5.11M, orders of magnitude higher than vanilla agentic frameworks (1.45--3.30M) and simple LLM sampling approaches. These results highlight the need for more cost-efficient agentic methods that can close the performance gap without prohibitive overhead. Ablation studies isolating the value of execution feedback are detailed in Appendix~\ref{app:ablation_full}.
+
+\begin{table}[htbp]
+\centering
+\caption{Agentic cost on NVIDIA A100 (210 operators). Total Tokens (M), Tokens per Successful Operator (M), and Total Time (h).}
+\label{tab:nvidia_cost_ops}
+\setlength{\tabcolsep}{3pt}
+\footnotesize
+\begin{tabular}{l | c c c}
+\toprule
+\textbf{Method} & \textbf{Total Tokens (M)} & \textbf{Tokens per Success (M)} & \textbf{Total Time (h)} \\
+\midrule
+Claude Code (Opus-4.6)     & 263 & 1.45 & 33 \\
+Claude Code (GLM-5.0)      & 243 & 1.67 & 45 \\
+Claude Code (Qwen3.5-27b)  & 381 & 2.93 & 48 \\
+Claude Code (MiniMax M-2.5)& 340 & 3.30 & 50 \\
+\midrule
+AKO4all (Opus-4.6)       & 904 & 5.19 & 83 \\
+CUDA Opt. Skill (MiniMax M-2.5) & 594 & 6.75 & 97 \\
+AutoKernel (GLM-5.0)      & 471 & 3.16 & 102 \\
+AutoKernel (Qwen3.5-27b)  & 475 & 4.80 & 102 \\
+AutoKernel (MiniMax M-2.5)& 508 & 5.64 & 105 \\
+\bottomrule
+\end{tabular}
+\end{table}
\ No newline at end of file
diff --git a/docs/mublas.md b/docs/mublas.md
new file mode 100644
index 0000000..8239890
--- /dev/null
+++ b/docs/mublas.md
@@ -0,0 +1,145 @@
+# cuBLAS → MUBLAS 迁移对照
+
+## 对照规则
+
+| cuBLAS | MUBLAS | 规则 |
+|--------|--------|------|
+| `cublas` 前缀 | `mublas` 前缀 | `cu` → `mu` |
+| `_v2` 后缀 | 去掉 | MUBLAS 无 `_v2` |
+| `_64` 后缀 | 去掉 | MUBLAS 无 `_64`，参数用 `int` 非 `int64_t` |
+| `cublasSgemmEx` | `mublasGemmEx` | 无类型前缀，单例泛型函数 |
+| `cublasHandle_t` | `mublasHandle_t` | |
+| `cudaStream_t` | `MUstream` | |
+| `cuComplex` | `muComplex` | |
+| `cuDoubleComplex` | `muDoubleComplex` | |
+| `CUBLAS_OP_*` | `MUBLAS_OP_*` | |
+| `CUBLAS_FILL_MODE_*` | `MUBLAS_FILL_MODE_*` | |
+| `CUBLAS_DIAG_*` | `MUBLAS_DIAG_*` | |
+| `CUBLAS_SIDE_*` | `MUBLAS_SIDE_*` | |
+| `CUBLAS_STATUS_*` | `MUBLAS_STATUS_*` | |
+| `<cublas_v2.h>` | `<mublas.h>` | |
+| `-lcublas` | `-lmublas` | |
+
+## 详细对照表
+
+```
+# ===== 函数替换 =====
+cublasCcopy_v2              → mublasCcopy
+cublasCdotu_v2              → mublasCdotu
+cublasCgemmStridedBatched   → mublasCgemmStridedBatched
+cublasCgemmStridedBatched_64→ mublasCgemmStridedBatched  (去 _64)
+cublasCgemm_v2              → mublasCgemm
+cublasCgemvBatched_64       → mublasCgemvBatched          (去 _64)
+cublasCgemvStridedBatched   → mublasCgemvStridedBatched
+cublasCgemv_v2              → mublasCgemv
+cublasCgeru_v2              → mublasCgeru
+cublasCsymm_v2              → mublasCsymm
+cublasCsymv_v2              → mublasCsymv
+cublasCsyrkEx               → mublasCsyrkEx
+cublasDasum_v2              → mublasDasum
+cublasDaxpy_v2              → mublasDaxpy
+cublasDcopy_v2              → mublasDcopy
+cublasDgemmBatched          → mublasDgemmBatched
+cublasDgemmStridedBatched   → mublasDgemmStridedBatched
+cublasDgemmStridedBatched_64→ mublasDgemmStridedBatched  (去 _64)
+cublasDgemm_v2              → mublasDgemm
+cublasDgemvBatched          → mublasDgemvBatched
+cublasDgemvStridedBatched   → mublasDgemvStridedBatched
+cublasDgemv_v2              → mublasDgemv
+cublasDsbmv_v2              → mublasDsbmv
+cublasDsyr2_v2              → mublasDsyr2
+cublasDtrsmBatched          → mublasDtrsmBatched
+cublasHgemmBatched          → mublasHgemmBatched
+cublasHgemmStridedBatched   → mublasHgemmStridedBatched
+cublasSaxpy_v2              → mublasSaxpy
+cublasSdgmm                 → mublasSdgmm
+cublasSdot_v2               → mublasSdot
+cublasSgeam                 → mublasSgeam
+cublasSgemmBatched_64       → mublasSgemmBatched          (去 _64)
+cublasSgemmEx               → mublasGemmEx                (去类型前缀 S)
+cublasSgemmStridedBatched   → mublasSgemmStridedBatched
+cublasSgemm_v2              → mublasSgemm
+cublasSgemvBatched          → mublasSgemvBatched
+cublasSgemvStridedBatched   → mublasSgemvStridedBatched
+cublasSger_v2               → mublasSger
+cublasSscal_v2              → mublasSscal
+cublasSsyrk_v2              → mublasSsyrk
+cublasStbmv_v2              → mublasStbmv
+cublasStrsm_v2              → mublasStrsm
+cublasStrsv_v2              → mublasStrsv
+cublasZdotc_v2              → mublasZdotc
+cublasZgemmBatched          → mublasZgemmBatched
+cublasZgemmStridedBatched   → mublasZgemmStridedBatched
+cublasZgemvBatched          → mublasZgemvBatched
+cublasZgemvStridedBatched   → mublasZgemvStridedBatched
+cublasZgerc_v2              → mublasZgerc
+cublasZswap_v2              → mublasZswap
+cublasZtrsmBatched          → mublasZtrsmBatched
+
+# ===== 类型替换 =====
+cublasHandle_t              → mublasHandle_t
+cublasStatus_t              → mublasStatus
+cublasOperation_t           → mublasOperation_t / mublasOperation
+cublasFillMode_t            → mublasFillMode_t
+cublasDiagType_t            → mublasDiagType_t
+cublasSideMode_t            → mublasSideMode_t
+cudaStream_t                → MUstream
+cuComplex                   → muComplex
+cuDoubleComplex             → muDoubleComplex
+__half                      → __half  (不变, 来自 musa_fp16.h)
+
+# ===== 枚举替换 =====
+CUBLAS_OP_N / T / C         → MUBLAS_OP_N / T / C
+CUBLAS_FILL_MODE_LOWER/UPPER→ MUBLAS_FILL_MODE_LOWER/UPPER (多 FULL=123 option)
+CUBLAS_DIAG_NON_UNIT/UNIT   → MUBLAS_DIAG_NON_UNIT/UNIT
+CUBLAS_SIDE_LEFT/RIGHT      → MUBLAS_SIDE_LEFT/RIGHT (多 BOTH=143 option)
+CUBLAS_STATUS_SUCCESS       → MUBLAS_STATUS_SUCCESS
+CUBLAS_STATUS_*             → MUBLAS_STATUS_*
+
+# ===== 辅助函数 =====
+cublasCreate                → mublasCreate
+cublasDestroy               → mublasDestroy
+cublasSetStream             → mublasSetStream
+cublasGetStream             → mublasGetStream
+cublasSetMathMode           → mublasSetMathMode
+cublasGetMathMode           → mublasGetMathMode
+cublasGetVersion            → mublasGetVersion
+cublasSetPointerMode        → mublasSetPointerMode
+cublasGetPointerMode        → mublasGetPointerMode
+cublasSetAtomicsMode        → mublasSetAtomicsMode
+cublasGetAtomicsMode        → mublasGetAtomicsMode
+cublasSetWorkspace          → mublasSetWorkspace
+cublasSetVectorAsync        → mublasSetVectorAsync
+cublasGetVectorAsync        → mublasGetVectorAsync
+cublasSetMatrixAsync        → mublasSetMatrixAsync
+cublasGetMatrixAsync        → mublasGetMatrixAsync
+```
+
+## Python import 替换示例
+
+```python
+# 原 cuBLAS
+from .cublasCcopy_v2 import cublasCcopy_v2
+from .cublasSgemmEx import cublasSgemmEx
+from .cublasSgemmBatched_64 import cublasSgemmBatched_64
+
+# 改为 MUBLAS
+from .mublasCcopy import mublasCcopy
+from .mublasGemmEx import mublasGemmEx         # 注意: 去 S 前缀
+from .mublasSgemmBatched import mublasSgemmBatched  # 去 _64
+```
+
+## 3 个例外
+
+| # | cuBLAS | MUBLAS | 原因 |
+|---|--------|--------|------|
+| 1 | `cublasSgemmEx` | `mublasGemmEx` | 摩尔用泛型 `GemmEx` 无类型前缀 |
+| 2 | `*_64` 系列 | 去掉 `_64` | 摩尔不存在 `_64` 变种，参数均为 `int` |
+| 3 | `*_v2` 系列 | 去掉 `_v2` | 摩尔使用 C99 接口无版本后缀 |
+
+## MUBLAS 版本信息
+
+- 版本: **1.10.6**
+- 库文件: `libmublas.so`, `libmublasLt.so`
+- 头文件: `mublas.h`, `mublas_v2.h`(5.1.0), `mublasLt.h`, `mublasXt.h`
+- 路径: `/usr/local/musa/include/`
\ No newline at end of file
diff --git a/docs/mublas_fix.md b/docs/mublas_fix.md
new file mode 100644
index 0000000..1e277de
--- /dev/null
+++ b/docs/mublas_fix.md
@@ -0,0 +1,458 @@
+# cuBLAS → MUBLAS 3 个问题具体修改方案
+
+---
+
+## 问题 1: `cublasSgemmEx` → `mublasGemmEx`
+
+### 问题描述
+函数名去掉了类型前缀 `S`，且 computeType/algo 枚举完全换了一套。
+
+### 函数签名对照
+
+```cpp
+// cuBLAS 原版
+cublasStatus_t cublasSgemmEx(
+    cublasHandle_t handle,
+    cublasOperation_t transa, cublasOperation_t transb,
+    int m, int n, int k,
+    const void *alpha,
+    const void *A, cudaDataType_t Atype, int lda,
+    const void *B, cudaDataType_t Btype, int ldb,
+    const void *beta,
+    void *C, cudaDataType_t Ctype, int ldc,
+    cudaDataType_t computeType,
+    cublasGemmAlgo_t algo
+);
+
+// MUBLAS 新版
+mublasStatus mublasGemmEx(
+    mublasHandle_t handle,
+    mublasOperation_t transA, mublasOperation_t transB,
+    int m, int n, int k,
+    const void *alpha,
+    const void *a, musaDataType_t a_type, int lda,
+    const void *b, musaDataType_t b_type, int ldb,
+    const void *beta,
+    void *c, musaDataType_t c_type, int ldc,
+    mublasComputeType_t compute_type,   // ← 类型变了！
+    mublasGemmAlgo_t algo               // ← 枚举变了！
+);
+```
+
+### computeType 对照
+
+| 用法场景 | cuBLAS `cudaDataType_t` | MUBLAS `mublasComputeType_t` |
+|----------|------------------------|------------------------------|
+| FP16 输入, FP16 累加 | `CUDA_R_16F` (值=2) | `MUBLAS_COMPUTE_16F` (值=64) |
+| FP16 输入, FP32 累加 | `CUDA_R_32F` (值=0) | `MUBLAS_COMPUTE_32F` (值=68) |
+| FP32 输入, FP32 累加 | `CUDA_R_32F` (值=0) | `MUBLAS_COMPUTE_32F` (值=68) |
+| TF32 输入, FP32 累加 | `CUDA_R_32F_FAST_TF32` (值=...) | `MUBLAS_COMPUTE_32F_FAST_TF32` (值=77) |
+| FP64 输入, FP64 累加 | `CUDA_R_64F` (值=4) | `MUBLAS_COMPUTE_64F` (值=70) |
+| INT32 输入, INT32 累加 | `CUDA_R_32I` (值=8) | `MUBLAS_COMPUTE_32I` (值=72) |
+
+**关键**: 不能只替换宏名，因为数值完全不同！
+
+### algo 对照
+
+| cuBLAS `cublasGemmAlgo_t` | MUBLAS `mublasGemmAlgo_t` |
+|---------------------------|---------------------------|
+| `CUBLAS_GEMM_DEFAULT` (很多值可用) | `MUBLAS_GEMM_DEFAULT` (值=0x0) |
+| `CUBLAS_GEMM_DEFAULT_TENSOR_OP` | `MUBLAS_GEMM_DEFAULT_TENSOR_OP` (值=0x1) |
+
+⚠️ cuBLAS 有 `CUBLAS_GEMM_ALGO0` 到 `CUBLAS_GEMM_ALGO15`、`CUBLAS_GEMM_DFALT` 等多个 algo，
+MUBLAS 只有 2 个。不能用数值索引。
+
+### 迁移代码 (C++ 封装方案)
+
+如果你有很多地方调用 `cublasSgemmEx`，最干净的方案是写一个兼容宏或内联函数：
+
+```cpp
+// === 方案 A: 内联兼容函数 (推荐) ===
+inline mublasStatus mublasSgemmEx(
+    mublasHandle_t handle,
+    mublasOperation_t transA, mublasOperation_t transB,
+    int m, int n, int k,
+    const void *alpha,
+    const void *A, musaDataType_t Atype, int lda,
+    const void *B, musaDataType_t Btype, int ldb,
+    const void *beta,
+    void *C, musaDataType_t Ctype, int ldc,
+    musaDataType_t oldComputeType,  // 旧的 cudaDataType_t 值
+    mublasGemmAlgo_t algo)
+{
+    // 把旧 cudaDataType_t 映射到 mublasComputeType_t
+    mublasComputeType_t newComputeType;
+    switch (oldComputeType) {
+        case 2:  // CUDA_R_16F 或 musa 16F
+            newComputeType = MUBLAS_COMPUTE_16F;   // 64
+            break;
+        case 0:  // CUDA_R_32F 或 musa 32F
+            newComputeType = MUBLAS_COMPUTE_32F;   // 68
+            break;
+        case 4:  // CUDA_R_64F 或 musa 64F
+            newComputeType = MUBLAS_COMPUTE_64F;   // 70
+            break;
+        case 8:  // CUDA_R_32I 或 musa 32I
+            newComputeType = MUBLAS_COMPUTE_32I;   // 72
+            break;
+        default:
+            newComputeType = MUBLAS_COMPUTE_32F;   // fallback
+    }
+    return mublasGemmEx(handle, transA, transB, m, n, k,
+                        alpha, A, Atype, lda, B, Btype, ldb,
+                        beta, C, Ctype, ldc,
+                        newComputeType, algo);
+}
+
+// 然后全局替换:
+// cublasSgemmEx(  →  mublasSgemmEx(
+```
+```cpp
+// === 方案 B: 逐处替换 ===
+// 旧代码:
+cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
+              M, N, K,
+              &alpha, A, CUDA_R_16F, lda, B, CUDA_R_16F, ldb,
+              &beta, C, CUDA_R_16F, ldc,
+              CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
+
+// 新代码:
+mublasGemmEx(handle, MUBLAS_OP_N, MUBLAS_OP_N,
+             M, N, K,
+             &alpha, A, MUSA_R_16F, lda, B, MUSA_R_16F, ldb,
+             &beta, C, MUSA_R_16F, ldc,
+             MUBLAS_COMPUTE_32F, MUBLAS_GEMM_DEFAULT);
+```
+```cpp
+// === 方案 C: Python/ctypes 用法 ===
+// 如果通过 ctypes 调用，直接改函数名和参数值:
+
+// 旧:
+lib.cublasSgemmEx(handle, 0, 0,  # CUBLAS_OP_N = 0,0
+#                  m, n, k,
+#                  alpha_ptr, a_ptr, 2, lda,  # CUDA_R_16F = 2
+#                  b_ptr, 2, ldb,
+#                  beta_ptr, c_ptr, 2, ldc,
+#                  0, 0)  # CUDA_R_32F = 0, CUBLAS_GEMM_DEFAULT = 0
+
+// 新:
+lib.mublasGemmEx(handle, 111, 111,  # MUBLAS_OP_N = 111,111
+#                 m, n, k,
+#                 alpha_ptr, a_ptr, 2, lda,   # musa 16F API 值也是 2
+#                 b_ptr, 2, ldb,
+#                 beta_ptr, c_ptr, 2, ldc,
+#                 68, 0)  # MUBLAS_COMPUTE_32F = 68, MUBLAS_GEMM_DEFAULT = 0
+```
+
+---
+
+## 问题 2: `_64` 后缀 → 参数类型 int64_t → int
+
+### 问题描述
+cuBLAS 中带 `_64` 后缀的函数使用 `int64_t` 作为矩阵维度参数。
+MUBLAS 去掉了 `_64` 变种，所有维度都用 `int` (int32_t)。
+
+### 受影响函数 (4个)
+
+| cuBLAS | MUBLAS |
+|--------|--------|
+| `cublasCgemmStridedBatched_64` | `mublasCgemmStridedBatched` |
+| `cublasDgemmStridedBatched_64` | `mublasDgemmStridedBatched` |
+| `cublasSgemmBatched_64` | `mublasSgemmBatched` |
+| `cublasCgemvBatched_64` | `mublasCgemvBatched` |
+
+### 迁移代码
+
+```cpp
+// === 旧代码 ===
+int64_t M = 65536, N = 65536, K = 65536;
+cublasDgemmStridedBatched_64(handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                              M, N, K,          // int64_t
+                              &alpha, A, lda, strideA,
+                              B, ldb, strideB,
+                              &beta, C, ldc, strideC, batchCount);
+
+// === 新代码 ===
+int M = 65536, N = 65536, K = 65536;  // int64_t → int
+mublasDgemmStridedBatched(handle, MUBLAS_OP_N, MUBLAS_OP_N,
+                          M, N, K,              // int
+                          &alpha, A, lda, strideA,
+                          B, ldb, strideB,
+                          &beta, C, ldc, strideC, batchCount);
+```
+
+```cpp
+// === 如果一定有超大矩阵 (M*N*K > 2^31-1) ===
+// 方案 1: 拆成多个子矩阵分次计算
+// 方案 2: 检查 MUBLAS 新版是否重新提供了 _64 变种
+//         (MUBLAS 1.10.6 目前没有，未来可能加)
+// 方案 3: 使用 GemmEx + 手动分块
+
+// 安全检查宏:
+#define MUBLAS_CHECK_SIZE(val, name) \
+    do { \
+        if ((val) > INT32_MAX) { \
+            fprintf(stderr, "%s exceeds INT32_MAX: %ld\n", name, (long)(val)); \
+            return MUBLAS_STATUS_INVALID_SIZE; \
+        } \
+    } while(0)
+```
+
+### stride 参数
+stride 参数（`strideA`, `strideB`, `strideC`）在 MUBLAS 中仍然是 `long long int` (64位)，**不受影响**。只有 m, n, k 维度收窄为 int。
+
+---
+
+## 问题 3: `cudaStream_t` → `MUstream`
+
+### 问题描述
+MUBLAS 的 stream 类型是 `MUstream` 而非 `cudaStream_t`。
+
+但两者底层是兼容的，都指向 `struct MUstream_st*`。
+
+### 迁移代码
+
+```cpp
+// === 旧代码 ===
+cudaStream_t stream;
+cudaStreamCreate(&stream);
+cublasSetStream(handle, stream);
+cublasSetVectorAsync(n, elemSize, x, incx, y, incy, stream);
+
+// === 新代码 ===
+MUstream stream;                        // cudaStream_t → MUstream
+muStreamCreate(&stream);                // cudaStreamCreate → muStreamCreate
+mublasSetStream(handle, stream);
+mublasSetVectorAsync(n, elemSize, x, incx, y, incy, stream);
+
+// === 如果不想改接口，用 typedef 桥接 ===
+// 方案: 在公共头文件中加入:
+#if defined(__MUSACC__)
+    typedef MUstream cudaStream_t;      // MUSA 上 cudaStream_t 就是 MUstream
+#endif
+// 这样 mublasSetStream(handle, cudaStream_t_var) 直接可用
+```
+
+```cpp
+// === 所有受影响的 mublas 辅助函数 ===
+// mublasSetStream(handle, MUstream)
+// mublasGetStream(handle, MUstream*)
+// mublasSetKernelStream(handle, MUstream)       // MUBLAS 新增
+// mublasSetVectorAsync(..., MUstream)
+// mublasGetVectorAsync(..., MUstream)
+// mublasSetMatrixAsync(..., MUstream)
+// mublasGetMatrixAsync(..., MUstream)
+```
+
+---
+
+## 附: 枚举值问题 —— 推荐修复方案
+
+**这是 50 个函数都受影响的问题**，不只是那 3 个。
+
+### 方案 A: 统一头文件桥接 (推荐)
+
+```c
+// mublas_compat.h — 包含此头文件即可安全迁移
+#include <mublas.h>
+
+// 如果代码里还有裸数字，加编译期检查
+#if defined(__MUSACC__)
+    // 强制使用命名常量
+    #define MUBLAS_SAFE_OP_N    MUBLAS_OP_N
+    #define MUBLAS_SAFE_OP_T    MUBLAS_OP_T
+    #define MUBLAS_SAFE_FILL_L  MUBLAS_FILL_MODE_LOWER
+    #define MUBLAS_SAFE_FILL_U  MUBLAS_FILL_MODE_UPPER
+    // ... 等等
+#endif
+```
+
+### 方案 B: 逐文件 sed 检查
+
+```bash
+# 检查是否有裸数字用作枚举 (只检查特定场景)
+# 匹配带 trans/op/fill/diag/side 参数位置的可能裸数字
+grep -rnP "(trans|op|fill|diag|side).*,\s*[012]\s*,.*\)" *.c *.cpp *.cu
+
+# 如果搜到结果，该处可能传了裸数字而不是枚举常量，需要改为:
+#   0 → MUBLAS_OP_N / MUBLAS_FILL_MODE_LOWER / MUBLAS_DIAG_NON_UNIT / MUBLAS_SIDE_LEFT
+#   1 → MUBLAS_OP_T / MUBLAS_FILL_MODE_UPPER / MUBLAS_DIAG_UNIT / MUBLAS_SIDE_RIGHT
+#   2 → MUBLAS_OP_C
+```
+
+### 方案 C: Python 侧迁移
+
+```python
+# 旧代码
+class CublasOp:
+    N = 0
+    T = 1
+    C = 2
+
+class CublasFill:
+    LOWER = 0
+    UPPER = 1
+
+# 新代码
+class MublasOp:
+    N = 111
+    T = 112
+    C = 113
+
+class MublasFill:
+    UPPER = 121
+    LOWER = 122
+    FULL  = 123    # MUBLAS 新增
+
+class MublasDiag:
+    NON_UNIT = 131
+    UNIT     = 132
+
+class MublasSide:
+    LEFT  = 141
+    RIGHT = 142
+    BOTH  = 143    # MUBLAS 新增
+```
+
+---
+
+## 完整迁移脚本
+
+```bash
+#!/bin/bash
+# migrate_cublas_to_mublas.sh — 自动迁移脚本
+
+TARGET_DIR=${1:-.}
+
+echo "=== Step 1: 函数名替换 ==="
+# 基本替换
+find "$TARGET_DIR" -name "*.c" -o -name "*.cpp" -o -name "*.h" -o -name "*.cu" -o -name "*.py" | while read f; do
+    # _v2 后缀去掉
+    sed -i 's/cublasCcopy_v2/mublasCcopy/g' "$f"
+    sed -i 's/cublasCdotu_v2/mublasCdotu/g' "$f"
+    sed -i 's/cublasZdotc_v2/mublasZdotc/g' "$f"
+    sed -i 's/cublasCgemm_v2/mublasCgemm/g' "$f"
+    sed -i 's/cublasCgemv_v2/mublasCgemv/g' "$f"
+    sed -i 's/cublasCgeru_v2/mublasCgeru/g' "$f"
+    sed -i 's/cublasCsymm_v2/mublasCsymm/g' "$f"
+    sed -i 's/cublasCsymv_v2/mublasCsymv/g' "$f"
+    sed -i 's/cublasDasum_v2/mublasDasum/g' "$f"
+    sed -i 's/cublasDaxpy_v2/mublasDaxpy/g' "$f"
+    sed -i 's/cublasDcopy_v2/mublasDcopy/g' "$f"
+    sed -i 's/cublasDgemm_v2/mublasDgemm/g' "$f"
+    sed -i 's/cublasDgemv_v2/mublasDgemv/g' "$f"
+    sed -i 's/cublasDsbmv_v2/mublasDsbmv/g' "$f"
+    sed -i 's/cublasDsyr2_v2/mublasDsyr2/g' "$f"
+    sed -i 's/cublasSaxpy_v2/mublasSaxpy/g' "$f"
+    sed -i 's/cublasSdot_v2/mublasSdot/g' "$f"
+    sed -i 's/cublasSgemm_v2/mublasSgemm/g' "$f"
+    sed -i 's/cublasSger_v2/mublasSger/g' "$f"
+    sed -i 's/cublasSscal_v2/mublasSscal/g' "$f"
+    sed -i 's/cublasSsyrk_v2/mublasSsyrk/g' "$f"
+    sed -i 's/cublasStbmv_v2/mublasStbmv/g' "$f"
+    sed -i 's/cublasStrsm_v2/mublasStrsm/g' "$f"
+    sed -i 's/cublasStrsv_v2/mublasStrsv/g' "$f"
+    sed -i 's/cublasZgerc_v2/mublasZgerc/g' "$f"
+    sed -i 's/cublasZswap_v2/mublasZswap/g' "$f"
+
+    # 不带 _v2 的
+    sed -i 's/cublasCgemmStridedBatched\b/mublasCgemmStridedBatched/g' "$f"
+    sed -i 's/cublasDgemmStridedBatched\b/mublasDgemmStridedBatched/g' "$f"
+    sed -i 's/cublasHgemmStridedBatched\b/mublasHgemmStridedBatched/g' "$f"
+    sed -i 's/cublasSgemmStridedBatched\b/mublasSgemmStridedBatched/g' "$f"
+    sed -i 's/cublasCgemvStridedBatched/mublasCgemvStridedBatched/g' "$f"
+    sed -i 's/cublasDgemvStridedBatched/mublasDgemvStridedBatched/g' "$f"
+    sed -i 's/cublasSgemvStridedBatched/mublasSgemvStridedBatched/g' "$f"
+    sed -i 's/cublasZgemvStridedBatched/mublasZgemvStridedBatched/g' "$f"
+    sed -i 's/cublasCgemmStridedBatched/mublasCgemmStridedBatched/g' "$f"
+    sed -i 's/cublasDgemmBatched/mublasDgemmBatched/g' "$f"
+    sed -i 's/cublasHgemmBatched/mublasHgemmBatched/g' "$f"
+    sed -i 's/cublasZgemmBatched/mublasZgemmBatched/g' "$f"
+    sed -i 's/cublasDgemvBatched/mublasDgemvBatched/g' "$f"
+    sed -i 's/cublasSgemvBatched/mublasSgemvBatched/g' "$f"
+    sed -i 's/cublasCgemvBatched/mublasCgemvBatched/g' "$f"
+    sed -i 's/cublasZgemvBatched/mublasZgemvBatched/g' "$f"
+    sed -i 's/cublasDtrsmBatched/mublasDtrsmBatched/g' "$f"
+    sed -i 's/cublasZtrsmBatched/mublasZtrsmBatched/g' "$f"
+    sed -i 's/cublasSdgmm/mublasSdgmm/g' "$f"
+    sed -i 's/cublasSgeam/mublasSgeam/g' "$f"
+    sed -i 's/cublasCsyrkEx/mublasCsyrkEx/g' "$f"
+done
+
+echo "=== Step 2: _64 后缀去掉 ==="
+find "$TARGET_DIR" -name "*.c" -o -name "*.cpp" -o -name "*.h" -o -name "*.cu" -o -name "*.py" | while read f; do
+    sed -i 's/cublasCgemmStridedBatched_64/mublasCgemmStridedBatched/g' "$f"
+    sed -i 's/cublasDgemmStridedBatched_64/mublasDgemmStridedBatched/g' "$f"
+    sed -i 's/cublasSgemmBatched_64/mublasSgemmBatched/g' "$f"
+    sed -i 's/cublasCgemvBatched_64/mublasCgemvBatched/g' "$f"
+done
+
+echo "=== Step 3: cublasSgemmEx → mublasGemmEx (特殊处理) ==="
+find "$TARGET_DIR" -name "*.c" -o -name "*.cpp" -o -name "*.h" -o -name "*.cu" -o -name "*.py" | while read f; do
+    sed -i 's/cublasSgemmEx\b/mublasGemmEx/g' "$f"
+done
+
+echo "=== Step 4: 辅助函数 ==="
+find "$TARGET_DIR" -name "*.c" -o -name "*.cpp" -o -name "*.h" -o -name "*.cu" -o -name "*.py" | while read f; do
+    sed -i 's/cublasCreate/mublasCreate/g' "$f"
+    sed -i 's/cublasDestroy/mublasDestroy/g' "$f"
+    sed -i 's/cublasSetStream/mublasSetStream/g' "$f"
+    sed -i 's/cublasGetStream/mublasGetStream/g' "$f"
+    sed -i 's/cublasSetMathMode/mublasSetMathMode/g' "$f"
+    sed -i 's/cublasGetMathMode/mublasGetMathMode/g' "$f"
+    sed -i 's/cublasGetVersion/mublasGetVersion/g' "$f"
+    sed -i 's/cublasSetPointerMode/mublasSetPointerMode/g' "$f"
+    sed -i 's/cublasGetPointerMode/mublasGetPointerMode/g' "$f"
+    sed -i 's/cublasSetAtomicsMode/mublasSetAtomicsMode/g' "$f"
+    sed -i 's/cublasGetAtomicsMode/mublasGetAtomicsMode/g' "$f"
+    sed -i 's/cublasSetWorkspace/mublasSetWorkspace/g' "$f"
+    sed -i 's/cublasSetVectorAsync/mublasSetVectorAsync/g' "$f"
+    sed -i 's/cublasGetVectorAsync/mublasGetVectorAsync/g' "$f"
+    sed -i 's/cublasSetMatrixAsync/mublasSetMatrixAsync/g' "$f"
+    sed -i 's/cublasGetMatrixAsync/mublasGetMatrixAsync/g' "$f"
+done
+
+echo "=== Step 5: 枚举/类型替换 ==="
+find "$TARGET_DIR" -name "*.c" -o -name "*.cpp" -o -name "*.h" -o -name "*.cu" -o -name "*.py" | while read f; do
+    sed -i 's/CUBLAS_OP_N\b/MUBLAS_OP_N/g' "$f"
+    sed -i 's/CUBLAS_OP_T\b/MUBLAS_OP_T/g' "$f"
+    sed -i 's/CUBLAS_OP_C\b/MUBLAS_OP_C/g' "$f"
+    sed -i 's/CUBLAS_FILL_MODE_LOWER/MUBLAS_FILL_MODE_LOWER/g' "$f"
+    sed -i 's/CUBLAS_FILL_MODE_UPPER/MUBLAS_FILL_MODE_UPPER/g' "$f"
+    sed -i 's/CUBLAS_DIAG_NON_UNIT/MUBLAS_DIAG_NON_UNIT/g' "$f"
+    sed -i 's/CUBLAS_DIAG_UNIT/MUBLAS_DIAG_UNIT/g' "$f"
+    sed -i 's/CUBLAS_SIDE_LEFT\b/MUBLAS_SIDE_LEFT/g' "$f"
+    sed -i 's/CUBLAS_SIDE_RIGHT\b/MUBLAS_SIDE_RIGHT/g' "$f"
+    sed -i 's/CUBLAS_STATUS_SUCCESS/MUBLAS_STATUS_SUCCESS/g' "$f"
+    sed -i 's/CUBLAS_POINTER_MODE_HOST/MUBLAS_POINTER_MODE_HOST/g' "$f"
+    sed -i 's/CUBLAS_POINTER_MODE_DEVICE/MUBLAS_POINTER_MODE_DEVICE/g' "$f"
+    sed -i 's/CUBLAS_GEMM_DEFAULT\b/MUBLAS_GEMM_DEFAULT/g' "$f"
+    sed -i 's/CUBLAS_GEMM_DEFAULT_TENSOR_OP/MUBLAS_GEMM_DEFAULT_TENSOR_OP/g' "$f"
+    # 类型
+    sed -i 's/cublasHandle_t/mublasHandle_t/g' "$f"
+    sed -i 's/cublasStatus_t/mublasStatus/g' "$f"
+    sed -i 's/cublasOperation_t/mublasOperation_t/g' "$f"
+    sed -i 's/cublasFillMode_t/mublasFillMode_t/g' "$f"
+    sed -i 's/cublasDiagType_t/mublasDiagType_t/g' "$f"
+    sed -i 's/cublasSideMode_t/mublasSideMode_t/g' "$f"
+    sed -i 's/cuComplex\b/muComplex/g' "$f"
+    sed -i 's/cuDoubleComplex/muDoubleComplex/g' "$f"
+    sed -i 's/cudaStream_t/MUstream/g' "$f"
+done
+
+echo "=== Step 6: 头文件和链接 ==="
+find "$TARGET_DIR" -name "*.c" -o -name "*.cpp" -o -name "*.h" -o -name "*.cu" -o -name "*.py" | while read f; do
+    sed -i 's|#include <cublas_v2.h>|#include <mublas.h>|g' "$f"
+    sed -i 's|#include <cuda_fp16.h>|#include <musa_fp16.h>|g' "$f"
+    sed -i 's/-lcublas/-lmublas/g' "$f"
+    # shared library
+    sed -i 's/libcublas/libmublas/g' "$f"
+done
+
+echo "=== Done ==="
+echo "⚠️  请手动检查:"
+echo "  1. cublasSgemmEx → mublasGemmEx 的 computeType 参数值"
+echo "  2. _64 函数的 int64_t→int 类型转换"
+echo "  3. 是否有裸数字 0/1/2 用作枚举 (grep -rnP '(trans|op|fill|diag|side).*[012]' .)"
+echo "  4. cudaStream_t 相关流操作是否需要适配 muStreamCreate/muStreamDestroy"
+```
\ No newline at end of file
diff --git a/docs/source/development/custom-operators.md b/docs/source/development/custom-operators.md
index 9df9d69..3691382 100644
--- a/docs/source/development/custom-operators.md
+++ b/docs/source/development/custom-operators.md
@@ -80,8 +80,7 @@ def test_accuracy_my_op(shape, dtype):
     y = torch.randn(M, N, device='cuda', dtype=dtype)
 
     ref_out = kernelgenbench.baseline.my_op(x, y)
-    # During verification, kernelgenbench.triton.my_op is the generated kernel
-    act_out = kernelgenbench.baseline.my_op(x.clone(), y.clone())
+    act_out = kernelgenbench.triton.my_op(x.clone(), y.clone())
 
     assert_close(act_out, ref_out, dtype)
 ```
diff --git a/docs/source/getting-started/index.md b/docs/source/getting-started/index.md
index eee633d..a28c592 100644
--- a/docs/source/getting-started/index.md
+++ b/docs/source/getting-started/index.md
@@ -61,7 +61,7 @@ export OPENAI_API_KEY=your_key
 export OPENAI_BASE_URL=http://your-endpoint/v1  # Optional, for custom endpoints
 ```
 
-### Step 3: Install Claude Code CLI (for Agent Track)
+### Install Claude Code CLI (for Agent Track)
 
 If you plan to use the Agent Track, install the Claude Code CLI:
 
diff --git a/docs/source/glossary/index.md b/docs/source/glossary/index.md
index 73a59c9..2c5db12 100644
--- a/docs/source/glossary/index.md
+++ b/docs/source/glossary/index.md
@@ -38,6 +38,10 @@ KernelGenBench-aten
 KernelGenBench-cublas
 : A dataset subset containing 50 cuBLAS operators, available only on NVIDIA platforms due to library dependencies.
 
+{.glossary}
+KernelGenBench-nocublas
+: A dataset subset containing 160 operators (ATen + vLLM), used for NVIDIA evaluation without cuBLAS dependency.
+
 {.glossary}
 KernelGenBench-MS
 : The Multi-Source sub-benchmark evaluating 210 operators from three sources (ATen, vLLM, cuBLAS) on NVIDIA hardware.